load libraries
PRS****************************************************************************************************************
Input the PRS data
# Read the per-individual polygenic risk score (PRS) table (tab-delimited).
# NOTE(review): absolute user-specific path — consider a relative or configurable path.
prs_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/PRS_array.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)
Set color palette
# Two-color palette for the arrest groups; colors are assigned to groups
# in factor-level order by the scale_*_manual() calls below.
group_colors <- c("grey","#3C8C78")
Define your list of traits to analyze
# The 18 PRS traits analyzed throughout this script (ECG/imaging metrics
# plus diagnosis-based syndrome scores)
traits <- c("Heart_rate", "LQTS", "PR_interval", "QTc", "Afib", "HRV", "BP", "QRS", "HCM", "LVEDV","LVEDVI","LVEF","LVESV","LVESVI","SV","SVI","GGE","Focal_Epilepsy")
traits
## [1] "Heart_rate" "LQTS" "PR_interval" "QTc"
## [5] "Afib" "HRV" "BP" "QRS"
## [9] "HCM" "LVEDV" "LVEDVI" "LVEF"
## [13] "LVESV" "LVESVI" "SV" "SVI"
## [17] "GGE" "Focal_Epilepsy"
Z normalize the scores
# Z-score each raw PRS column across the full cohort. scale() returns a
# one-column matrix, so as.numeric() flattens it back to a plain vector;
# results land in new "Normalized_<trait>" columns.
prs_data <- prs_data %>%
mutate(across(all_of(traits), ~ scale(.x) %>% as.numeric, .names = "Normalized_{.col}"))
Format the data
# Recode the arrest ontology labels into display categories; any other
# label is carried through unchanged
prs_data <- prs_data %>%
mutate(arrest_group = case_when(
arrest_ontology == "control" ~ "Control",
arrest_ontology == "case" ~ "Case",
TRUE ~ as.character(arrest_ontology)
))
# Keep only the samples labeled 'Control' or 'Case'
filtered_data <- prs_data %>%
filter(arrest_group %in% c("Control", "Case"))
# Generate normalized column names based on the traits list
normalized_traits <- paste("Normalized", traits, sep = "_")
# Guard against traits whose normalized column was not created
existing_normalized_traits <- normalized_traits[normalized_traits %in% names(filtered_data)]
# Reshape the filtered data to long format for the normalized traits.
# NOTE(review): reshape2 is retired; tidyr::pivot_longer() is the
# maintained equivalent — confirm before migrating (melt returns a
# factor `variable` column).
melted_data_normalized <- reshape2::melt(filtered_data,
id.vars = "arrest_group",
measure.vars = existing_normalized_traits)
Split into discovery or replication
# Partition the case/control samples into the two analysis sets.
prs_data_discovery <- dplyr::filter(filtered_data, Set == "Discovery")
prs_data_replication <- dplyr::filter(filtered_data, Set == "Replication")
Make the Correlation plot
# Pull the normalized PRS columns for the discovery set.
# NOTE(review): this uses `normalized_traits` rather than the guarded
# `existing_normalized_traits` built earlier — it errors if any column
# is missing; confirm all 18 columns always exist.
data_for_correlation <- prs_data_discovery[, normalized_traits]
# Display order for the correlation matrix rows/columns
corr_traits <- c(
"Normalized_QRS",
"Normalized_BP",
"Normalized_SVI",
"Normalized_LVESV",
"Normalized_PR_interval",
"Normalized_HRV",
"Normalized_GGE",
"Normalized_LVEDV",
"Normalized_LVESVI",
"Normalized_Focal_Epilepsy",
"Normalized_HCM",
"Normalized_LQTS",
"Normalized_Afib",
"Normalized_LVEDVI",
"Normalized_SV",
"Normalized_Heart_rate",
"Normalized_QTc",
"Normalized_LVEF"
)
# Reorder the columns to the display order above
data_for_correlation_ordered <- data_for_correlation[, corr_traits]
# Pairwise Pearson correlations on complete cases only
cor_matrix_ordered <- cor(data_for_correlation_ordered, use = "complete.obs")
# Heat-map style correlation plot (corrplot package)
corrplot(cor_matrix_ordered,
method = "color",
type = "full",
tl.col = "black",
tl.srt = 45,
cl.ratio = 0.2,
col = colorRampPalette(c("#05618F", "white", "#F0BE3C"))(200),
diag = FALSE,
addgrid.col = "black"
)
# Pairwise correlation p-values (Hmisc::rcorr); NA on the diagonal
correlation_results <- rcorr(as.matrix(data_for_correlation_ordered), type = "pearson")
correlation_results$P
## Normalized_QRS Normalized_BP Normalized_SVI
## Normalized_QRS NA 9.933678e-01 6.853895e-01
## Normalized_BP 9.933678e-01 NA 1.671754e-06
## Normalized_SVI 6.853895e-01 1.671754e-06 NA
## Normalized_LVESV 4.734779e-05 1.440789e-01 5.176039e-08
## Normalized_PR_interval 6.161903e-03 1.510868e-01 3.070402e-01
## Normalized_HRV 5.899446e-06 3.828959e-01 7.641050e-07
## Normalized_GGE 9.659843e-02 1.270667e-01 7.128341e-03
## Normalized_LVEDV 4.954003e-02 1.918542e-01 4.352831e-03
## Normalized_LVESVI 6.277110e-04 5.726864e-01 5.992465e-01
## Normalized_Focal_Epilepsy 9.955037e-02 3.021332e-01 2.018146e-01
## Normalized_HCM 2.396194e-02 7.604039e-01 2.522892e-02
## Normalized_LQTS 5.021915e-02 2.378957e-01 3.264184e-02
## Normalized_Afib 2.837904e-01 7.458152e-01 7.488965e-04
## Normalized_LVEDVI 4.291581e-02 2.605060e-03 1.172185e-05
## Normalized_SV 7.958341e-01 5.007362e-03 1.326023e-03
## Normalized_Heart_rate 7.704693e-04 7.404989e-02 0.000000e+00
## Normalized_QTc 3.611700e-01 3.700776e-05 1.148859e-12
## Normalized_LVEF 1.031027e-01 1.295488e-01 7.635401e-06
## Normalized_LVESV Normalized_PR_interval
## Normalized_QRS 4.734779e-05 0.006161903
## Normalized_BP 1.440789e-01 0.151086783
## Normalized_SVI 5.176039e-08 0.307040155
## Normalized_LVESV NA 0.279786899
## Normalized_PR_interval 2.797869e-01 NA
## Normalized_HRV 1.727377e-03 0.183954907
## Normalized_GGE 2.297310e-01 0.926572064
## Normalized_LVEDV 0.000000e+00 0.326133747
## Normalized_LVESVI 0.000000e+00 0.977482753
## Normalized_Focal_Epilepsy 3.413192e-01 0.873919485
## Normalized_HCM 0.000000e+00 0.779477674
## Normalized_LQTS 4.128755e-02 0.130976701
## Normalized_Afib 5.958125e-04 0.092859152
## Normalized_LVEDVI 0.000000e+00 0.837999134
## Normalized_SV 6.087284e-09 0.126695164
## Normalized_Heart_rate 9.201374e-03 0.151032458
## Normalized_QTc 6.331019e-01 0.389631858
## Normalized_LVEF 0.000000e+00 0.257495908
## Normalized_HRV Normalized_GGE Normalized_LVEDV
## Normalized_QRS 5.899446e-06 9.659843e-02 4.954003e-02
## Normalized_BP 3.828959e-01 1.270667e-01 1.918542e-01
## Normalized_SVI 7.641050e-07 7.128341e-03 4.352831e-03
## Normalized_LVESV 1.727377e-03 2.297310e-01 0.000000e+00
## Normalized_PR_interval 1.839549e-01 9.265721e-01 3.261337e-01
## Normalized_HRV NA 1.417508e-05 7.232691e-01
## Normalized_GGE 1.417508e-05 NA 4.327716e-01
## Normalized_LVEDV 7.232691e-01 4.327716e-01 NA
## Normalized_LVESVI 4.718379e-01 3.926784e-01 0.000000e+00
## Normalized_Focal_Epilepsy 2.515402e-01 9.696364e-01 1.792096e-01
## Normalized_HCM 2.268539e-01 7.662938e-02 9.463941e-11
## Normalized_LQTS 9.602938e-01 6.911754e-01 5.693882e-05
## Normalized_Afib 6.384594e-01 5.193028e-02 4.874951e-01
## Normalized_LVEDVI 3.163305e-01 7.224922e-02 0.000000e+00
## Normalized_SV 5.701852e-01 3.181255e-01 0.000000e+00
## Normalized_Heart_rate 1.551204e-12 1.646706e-01 1.215043e-03
## Normalized_QTc 6.050812e-03 9.795078e-03 9.106381e-03
## Normalized_LVEF 1.320877e-02 3.297949e-01 1.024596e-07
## Normalized_LVESVI Normalized_Focal_Epilepsy
## Normalized_QRS 0.000627711 0.09955037
## Normalized_BP 0.572686369 0.30213319
## Normalized_SVI 0.599246487 0.20181462
## Normalized_LVESV 0.000000000 0.34131916
## Normalized_PR_interval 0.977482753 0.87391948
## Normalized_HRV 0.471837901 0.25154018
## Normalized_GGE 0.392678433 0.96963636
## Normalized_LVEDV 0.000000000 0.17920961
## Normalized_LVESVI NA 0.30285180
## Normalized_Focal_Epilepsy 0.302851796 NA
## Normalized_HCM 0.000000000 0.95432712
## Normalized_LQTS 0.022630206 0.59571578
## Normalized_Afib 0.617610997 0.90444909
## Normalized_LVEDVI 0.000000000 0.70452838
## Normalized_SV 0.016279998 0.71995021
## Normalized_Heart_rate 0.285679168 0.19226455
## Normalized_QTc 0.071460435 0.97448035
## Normalized_LVEF 0.000000000 0.87505014
## Normalized_HCM Normalized_LQTS Normalized_Afib
## Normalized_QRS 2.396194e-02 5.021915e-02 0.2837904158
## Normalized_BP 7.604039e-01 2.378957e-01 0.7458151593
## Normalized_SVI 2.522892e-02 3.264184e-02 0.0007488965
## Normalized_LVESV 0.000000e+00 4.128755e-02 0.0005958125
## Normalized_PR_interval 7.794777e-01 1.309767e-01 0.0928591521
## Normalized_HRV 2.268539e-01 9.602938e-01 0.6384594227
## Normalized_GGE 7.662938e-02 6.911754e-01 0.0519302805
## Normalized_LVEDV 9.463941e-11 5.693882e-05 0.4874950835
## Normalized_LVESVI 0.000000e+00 2.263021e-02 0.6176109965
## Normalized_Focal_Epilepsy 9.543271e-01 5.957158e-01 0.9044490887
## Normalized_HCM NA 2.747912e-01 0.2051656313
## Normalized_LQTS 2.747912e-01 NA 0.6090492099
## Normalized_Afib 2.051656e-01 6.090492e-01 NA
## Normalized_LVEDVI 3.149592e-06 5.104404e-06 0.8024317643
## Normalized_SV 5.644828e-01 2.725126e-06 0.7848328230
## Normalized_Heart_rate 7.267055e-01 2.228441e-01 0.0174432137
## Normalized_QTc 9.979204e-01 0.000000e+00 0.1904777957
## Normalized_LVEF 0.000000e+00 5.014860e-01 0.0219999301
## Normalized_LVEDVI Normalized_SV Normalized_Heart_rate
## Normalized_QRS 4.291581e-02 7.958341e-01 7.704693e-04
## Normalized_BP 2.605060e-03 5.007362e-03 7.404989e-02
## Normalized_SVI 1.172185e-05 1.326023e-03 0.000000e+00
## Normalized_LVESV 0.000000e+00 6.087284e-09 9.201374e-03
## Normalized_PR_interval 8.379991e-01 1.266952e-01 1.510325e-01
## Normalized_HRV 3.163305e-01 5.701852e-01 1.551204e-12
## Normalized_GGE 7.224922e-02 3.181255e-01 1.646706e-01
## Normalized_LVEDV 0.000000e+00 0.000000e+00 1.215043e-03
## Normalized_LVESVI 0.000000e+00 1.628000e-02 2.856792e-01
## Normalized_Focal_Epilepsy 7.045284e-01 7.199502e-01 1.922646e-01
## Normalized_HCM 3.149592e-06 5.644828e-01 7.267055e-01
## Normalized_LQTS 5.104404e-06 2.725126e-06 2.228441e-01
## Normalized_Afib 8.024318e-01 7.848328e-01 1.744321e-02
## Normalized_LVEDVI NA 0.000000e+00 2.187902e-01
## Normalized_SV 0.000000e+00 NA 1.209219e-02
## Normalized_Heart_rate 2.187902e-01 1.209219e-02 NA
## Normalized_QTc 1.142112e-03 4.917063e-06 2.184145e-02
## Normalized_LVEF 3.606922e-03 1.979529e-02 1.680620e-02
## Normalized_QTc Normalized_LVEF
## Normalized_QRS 3.611700e-01 1.031027e-01
## Normalized_BP 3.700776e-05 1.295488e-01
## Normalized_SVI 1.148859e-12 7.635401e-06
## Normalized_LVESV 6.331019e-01 0.000000e+00
## Normalized_PR_interval 3.896319e-01 2.574959e-01
## Normalized_HRV 6.050812e-03 1.320877e-02
## Normalized_GGE 9.795078e-03 3.297949e-01
## Normalized_LVEDV 9.106381e-03 1.024596e-07
## Normalized_LVESVI 7.146043e-02 0.000000e+00
## Normalized_Focal_Epilepsy 9.744803e-01 8.750501e-01
## Normalized_HCM 9.979204e-01 0.000000e+00
## Normalized_LQTS 0.000000e+00 5.014860e-01
## Normalized_Afib 1.904778e-01 2.199993e-02
## Normalized_LVEDVI 1.142112e-03 3.606922e-03
## Normalized_SV 4.917063e-06 1.979529e-02
## Normalized_Heart_rate 2.184145e-02 1.680620e-02
## Normalized_QTc NA 8.490931e-02
## Normalized_LVEF 8.490931e-02 NA
Test for ANOVA assumptions
# Check the ANOVA assumptions for one trait against arrest_group:
# residual normality (Shapiro-Wilk) and homogeneity of variance
# (Levene and Bartlett).
#
# Args:
#   data:  data frame containing `arrest_group` and the trait column.
#   trait: character scalar, name of the (normalized) trait column.
#
# Returns a one-row data.frame with the three p-values, suitable for
# rbind-ing across traits.
check_anova_assumptions <- function(data, trait) {
  # Ensure 'arrest_group' is treated as a grouping factor
  data$arrest_group <- as.factor(data$arrest_group)
  # One-way model "<trait> ~ arrest_group". Named model_formula (and
  # model_resid below) to avoid shadowing stats::formula / stats::residuals.
  model_formula <- as.formula(paste(trait, "~ arrest_group"))
  anova_model <- aov(model_formula, data = data)
  # Shapiro-Wilk on the model residuals (normality assumption)
  model_resid <- residuals(anova_model)
  shapiro_p_value <- shapiro.test(model_resid)$p.value
  # Levene's test (robust to non-normality); leveneTest() is from the car package
  levene_test <- leveneTest(model_formula, data = data)
  levene_p_value <- levene_test$`Pr(>F)`[1]
  # Bartlett's test (more powerful under normality, sensitive otherwise)
  bartlett_test <- bartlett.test(model_formula, data = data)
  bartlett_p_value <- bartlett_test$p.value
  # One row per trait
  data.frame(
    Trait = trait,
    Shapiro_Wilk_p_value = shapiro_p_value,
    Levene_p_value = levene_p_value,
    Bartlett_p_value = bartlett_p_value
  )
}
# Run the assumption checks on every normalized trait in the discovery set
anova_assumptions_results <- lapply(
  normalized_traits,
  function(nt) check_anova_assumptions(prs_data_discovery, nt)
)
# Stack the one-row results into a single table and display it
anova_assumptions_df <- do.call(rbind, anova_assumptions_results)
print(anova_assumptions_df)
## Trait Shapiro_Wilk_p_value Levene_p_value
## 1 Normalized_Heart_rate 3.670786e-05 3.402430e-01
## 2 Normalized_LQTS 8.582805e-01 8.524520e-01
## 3 Normalized_PR_interval 9.626637e-02 7.067574e-01
## 4 Normalized_QTc 1.378815e-07 4.084220e-01
## 5 Normalized_Afib 5.252293e-01 4.766202e-01
## 6 Normalized_HRV 2.678043e-13 1.221016e-01
## 7 Normalized_BP 2.145452e-02 8.637621e-01
## 8 Normalized_QRS 1.508077e-39 2.042997e-06
## 9 Normalized_HCM 3.144379e-03 9.212937e-01
## 10 Normalized_LVEDV 1.331310e-01 2.218475e-04
## 11 Normalized_LVEDVI 5.919657e-02 7.738241e-03
## 12 Normalized_LVEF 2.471126e-02 3.111787e-01
## 13 Normalized_LVESV 1.169793e-02 9.853473e-01
## 14 Normalized_LVESVI 7.239432e-04 5.086718e-01
## 15 Normalized_SV 1.118507e-05 5.214914e-01
## 16 Normalized_SVI 8.610508e-04 8.519458e-01
## 17 Normalized_GGE 4.448074e-02 6.815913e-01
## 18 Normalized_Focal_Epilepsy 3.025018e-03 6.100802e-01
## Bartlett_p_value
## 1 6.287623e-02
## 2 7.465532e-01
## 3 8.498751e-01
## 4 5.892923e-01
## 5 5.905392e-01
## 6 6.549201e-02
## 7 8.042278e-01
## 8 1.040405e-24
## 9 8.033151e-01
## 10 9.995313e-04
## 11 1.890503e-02
## 12 4.997414e-01
## 13 8.378271e-01
## 14 2.967646e-01
## 15 6.599025e-01
## 16 6.489996e-01
## 17 5.545373e-01
## 18 4.610240e-01
Since some do not meet ANOVA criteria, we go with nonparametric
# Wilcoxon rank-sum test of Case vs Control for one normalized trait,
# with a manual Bonferroni correction and the difference in group medians.
#
# Args:
#   data:    data frame with `arrest_group` plus the trait column.
#   trait:   character scalar naming the trait column to test.
#   n_tests: number of comparisons in the Bonferroni family (default 18,
#            the number of PRS traits in this analysis — previously a
#            hard-coded constant).
#
# Returns a one-row data.frame: Trait, p_value, p_adjusted, significant,
# diff_median.
perform_wilcoxon_test <- function(data, trait, n_tests = 18) {
# Keep only the grouping column and the trait, dropping missing rows
data <- data %>%
dplyr::select(arrest_group, all_of(trait)) %>%
drop_na()
# Median per group; summarise() orders rows by the sorted group values,
# so they come out Case then Control
group_medians <- data %>%
group_by(arrest_group) %>%
summarise(median_value = median(!!sym(trait)))
# diff() = second row minus first, i.e. median(Control) - median(Case)
diff_median <- diff(group_medians$median_value)
# Two-sample Wilcoxon rank-sum test (rstatix::wilcox_test)
wilcoxon_test_result <- wilcox_test(data, as.formula(paste(trait, "~ arrest_group")))
p_value <- wilcoxon_test_result$p
# Bonferroni-adjust for the whole family of trait comparisons, not just
# the single p-value passed to p.adjust() here
p_adjusted <- p.adjust(p_value, method = "bonferroni", n = n_tests)
# Assemble the one-row result
result_df <- data.frame(
Trait = trait,
p_value = p_value,
p_adjusted = p_adjusted,
significant = p_adjusted < 0.05,
diff_median = diff_median
)
return(result_df)
}
# Run the Wilcoxon test on every normalized trait in the discovery set
# and bind the per-trait rows into one results table.
wilcoxon_results <- lapply(
  normalized_traits,
  function(nt) perform_wilcoxon_test(prs_data_discovery, nt)
)
combined_wilcoxon_results <- do.call(rbind, wilcoxon_results)
# Announce the results table
print("Wilcoxon Rank Sum Test Results with Manual P-value Adjustments and Median Differences:")
## [1] "Wilcoxon Rank Sum Test Results with Manual P-value Adjustments and Median Differences:"
print(combined_wilcoxon_results)
## Trait p_value p_adjusted significant diff_median
## 1 Normalized_Heart_rate 1.24e-02 2.2320e-01 FALSE 0.088165794
## 2 Normalized_LQTS 1.91e-01 1.0000e+00 FALSE 0.112902629
## 3 Normalized_PR_interval 2.34e-02 4.2120e-01 FALSE -0.160790733
## 4 Normalized_QTc 4.67e-03 8.4060e-02 FALSE 0.179095083
## 5 Normalized_Afib 1.07e-01 1.0000e+00 FALSE 0.080017842
## 6 Normalized_HRV 1.51e-01 1.0000e+00 FALSE -0.089874654
## 7 Normalized_BP 1.21e-06 2.1780e-05 TRUE -0.275654610
## 8 Normalized_QRS 1.12e-08 2.0160e-07 TRUE -0.128954882
## 9 Normalized_HCM 4.76e-01 1.0000e+00 FALSE 0.031802432
## 10 Normalized_LVEDV 6.38e-01 1.0000e+00 FALSE -0.044351499
## 11 Normalized_LVEDVI 7.37e-02 1.0000e+00 FALSE 0.118958269
## 12 Normalized_LVEF 1.76e-03 3.1680e-02 TRUE 0.124406676
## 13 Normalized_LVESV 7.02e-03 1.2636e-01 FALSE -0.124673369
## 14 Normalized_LVESVI 9.49e-01 1.0000e+00 FALSE -0.011000752
## 15 Normalized_SV 3.55e-02 6.3900e-01 FALSE 0.146894687
## 16 Normalized_SVI 1.77e-04 3.1860e-03 TRUE -0.360642163
## 17 Normalized_GGE 3.20e-01 1.0000e+00 FALSE -0.061833861
## 18 Normalized_Focal_Epilepsy 9.01e-01 1.0000e+00 FALSE 0.001265513
Make the lollipop plot
# Define the groups for metrics (quantitative ECG/imaging traits) and
# syndromes (diagnosis-based PRS)
metrics <- c("Normalized_Heart_rate", "Normalized_PR_interval", "Normalized_QTc", "Normalized_HRV", "Normalized_BP", "Normalized_QRS",
"Normalized_LVEDV", "Normalized_LVEDVI", "Normalized_LVEF", "Normalized_LVESV", "Normalized_LVESVI", "Normalized_SV", "Normalized_SVI")
syndromes <- c("Normalized_LQTS", "Normalized_Afib", "Normalized_HCM", "Normalized_GGE", "Normalized_Focal_Epilepsy")
# Categorize each trait as 'Metrics' or 'Syndromes'
combined_wilcoxon_results$Category <- ifelse(combined_wilcoxon_results$Trait %in% metrics, "Metrics",
ifelse(combined_wilcoxon_results$Trait %in% syndromes, "Syndromes", "Combined"))
# Reorder the levels of Trait based on the Category
# (NOTE: this ordering is superseded by the explicit ordered_traits
# releveling below)
combined_wilcoxon_results <- combined_wilcoxon_results %>%
mutate(Trait = factor(Trait, levels = unique(Trait[order(Category)])))
# Transform P-values to -log10 scale for plotting
combined_wilcoxon_results$NegLogP <- -log10(combined_wilcoxon_results$p_value)
# Reference lines: nominal alpha = 0.05 and Bonferroni alpha = 0.05/18
threshold <- -log10(0.05)
threshold2 <- -log10(0.05/18)
# Flag nominal (unadjusted) significance
combined_wilcoxon_results$Significant <- ifelse(combined_wilcoxon_results$p_value < 0.05, "Significant", "Not Significant")
# Explicit display order for the traits on the x axis
ordered_traits <- c(
"Normalized_QRS",
"Normalized_BP",
"Normalized_SVI",
"Normalized_LVESV",
"Normalized_PR_interval",
"Normalized_HRV",
"Normalized_GGE",
"Normalized_LVEDV",
"Normalized_LVESVI",
"Normalized_Focal_Epilepsy",
"Normalized_HCM",
"Normalized_LQTS",
"Normalized_Afib",
"Normalized_LVEDVI",
"Normalized_SV",
"Normalized_Heart_rate",
"Normalized_QTc",
"Normalized_LVEF"
)
# Update the 'Trait' column to have this specific order
combined_wilcoxon_results$Trait <- factor(combined_wilcoxon_results$Trait, levels = ordered_traits)
# Lollipop plot of -log10(p) per trait, colored by the median difference.
# `linewidth` replaces the `size` aesthetic for lines, deprecated in
# ggplot2 3.4.0 (the deprecation warning this code previously emitted).
p3 <- ggplot(combined_wilcoxon_results, aes(x = Trait, y = NegLogP, color = diff_median)) +
geom_segment(aes(xend = Trait, yend = 0), linewidth = 1) +
geom_point(size = 3) +
geom_hline(yintercept = threshold, linetype = "dotted", color = "Black") +
geom_hline(yintercept = threshold2, linetype = "dotted", color = "Red") +
scale_color_gradient2(low = "#3C8C78", mid = "white", high = "black", midpoint = 0) +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
strip.text.x = element_text(size = 10)) +
labs(y = "-log10(P-value)", x = "Trait", color = "Median Difference", title = "Lollipop Plot of -log10(P-values) by Trait")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the plot
print(p3)
# Build a stacked density + CDF plot of the rank distribution of one
# trait, colored by arrest group.
#
# Args:
#   data:         data frame with `arrest_group` and the trait column.
#   trait:        character scalar, name of the trait column to rank.
#   group_colors: vector of colors, assigned to groups in factor-level order.
#
# Returns a cowplot grid with the density plot (A) over the CDF plot (B).
generate_plots_for_trait <- function(data, trait, group_colors) {
# Rank the raw scores; NA scores keep NA ranks (na.last = "keep")
data$rank <- rank(data[[trait]], na.last = "keep")
# Relative density of the ranks per arrest group
density_plot <- ggplot(data, aes(x = rank, fill = arrest_group, y = after_stat(density))) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = group_colors) +
labs(title = paste("Relative Density of Overall", trait, "Trait Ranks Across Arrest Groups"),
x = paste("Overall Rank of", trait, "Scores"),
y = "Relative Density") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Empirical CDF of the ranks per arrest group. `linewidth` replaces the
# `size` aesthetic for lines, deprecated in ggplot2 3.4.0.
cdf_plot <- ggplot(data, aes(x = rank, color = arrest_group)) +
stat_ecdf(geom = "step", linewidth = 1) +
scale_color_manual(values = group_colors) +
labs(title = paste("CDF of Overall", trait, "Trait Ranks Across Arrest Groups"),
x = paste("Overall Rank of", trait, "Scores"),
y = "Cumulative Proportion") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Stack the two panels vertically with A/B labels
combined_plot <- plot_grid(density_plot, cdf_plot, ncol = 1, align = 'v', labels = c("A", "B"))
return(combined_plot)
}
# Traits to visualize with rank density/CDF plots.
# NOTE: this overwrites the earlier 18-element `traits` vector.
traits <- c("BP", "Afib", "QRS","LVEF","Heart_rate")
# Build one combined plot per trait, keyed by trait name
plots_list <- setNames(
  lapply(traits, function(tr) generate_plots_for_trait(prs_data_discovery, tr, group_colors)),
  traits
)
######## Access each plot by its trait name from the plots_list ##########
# For example, to print the plot for QRS, use:
print(plots_list[["QRS"]])
print(plots_list[["BP"]])
Coding regions******************************************************************************************************************
Bring in data
# Load per-individual coding-variant burden statistics (tab-delimited;
# NOTE(review): absolute user-specific path)
coding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/statistics_by_individual_replication.txt", header = TRUE, sep = "\t")
# Replication == 0 marks the discovery subset
coding_stats_discovery <- coding_stats[coding_stats$Replication == 0, ]
Categorize the data
# Map the raw case/control labels to display categories; any unexpected
# label becomes NA and is removed by the na.omit() cleanup below
categorized_coding_stats_discovery <- coding_stats_discovery %>%
mutate(Category_Group = case_when(
Category == "control" ~ "Control",
Category == "case" ~ "Case",
TRUE ~ NA_character_
))
Merge some of the column data to get the desired allele frequency intervals
# Collapse the per-allele-frequency missense counts into the intervals
# used downstream: 0.01-0.001 combined, <=0.00001 (incl. singletons),
# and a total-missense count per gene panel.
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
mutate(
Epilepsy_01_00001 = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
Epilepsy_00001 = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
Epilepsy_total_missense = Epilepsy_missense_common + Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001 +
Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
Cardiomyopathy_01_00001 = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
Cardiomyopathy_00001 = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
Cardiomyopathy_total_missense = Cardiomyopathy_missense_common + Cardiomyopathy_missense_variant_0.01 +
Cardiomyopathy_missense_variant_0.001 + Cardiomyopathy_missense_variant_0.00001 +
Cardiomyopathy_missense_singleton
)
# Clean up: drop identifier columns, rows with any NA (incl. unexpected
# Category labels mapped to NA above), and rows with infinite values
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
dplyr::select(-ID, -Category) %>%
na.omit() %>%
dplyr::filter(if_all(everything(), ~ !is.infinite(.)))
# Per-group mean and SD of every numeric burden column
collapsed_coding_stats <- categorized_coding_stats_discovery %>%
group_by(Category_Group) %>%
summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE),
std = ~sd(., na.rm = TRUE))))
# Drop any summary rows containing NA
collapsed_coding_stats <- na.omit(collapsed_coding_stats)
Perform permutation simulation and test
# Burden columns to test (7 per gene panel: HIGH-impact, start/stop,
# the merged AF intervals, total missense, and PLP)
coding_variables_to_analyze <- c("Cardiomyopathy_HIGH", "Cardiomyopathy_start_lost",
"Cardiomyopathy_stop_gained", "Cardiomyopathy_01_00001",
"Cardiomyopathy_00001", "Cardiomyopathy_total_missense", "PLP_Cardiomyopathy",
"Epilepsy_HIGH", "Epilepsy_start_lost",
"Epilepsy_stop_gained", "Epilepsy_01_00001",
"Epilepsy_00001", "Epilepsy_total_missense", "PLP_Epilepsy")
# Two-sided permutation test for a difference in group means.
#
# Args:
#   data:             data frame holding the variable and group columns.
#   var:              character, name of the numeric column to test.
#   group_var:        character, name of the grouping column; compares
#                     the labels "Case" and "Control".
#   num_permutations: number of random label shuffles (default 10000).
#
# Returns list(observed_stat, perm_stats, p_value), or NULL (with a
# warning) when either group has fewer than two members.
permutation_test <- function(data, var, group_var, num_permutations = 10000) {
  # Extract the columns once instead of re-indexing data inside the loop
  group_labels <- data[[group_var]]
  values <- data[[var]]
  # Require at least two members per group for a meaningful comparison
  if (sum(group_labels == "Case", na.rm = TRUE) > 1 &&
      sum(group_labels == "Control", na.rm = TRUE) > 1) {
    # Observed absolute difference in group means
    observed_stat <- abs(mean(values[group_labels == "Case"], na.rm = TRUE) -
                           mean(values[group_labels == "Control"], na.rm = TRUE))
    # Null distribution: shuffle the labels and recompute the statistic
    perm_stats <- numeric(num_permutations)
    for (i in seq_len(num_permutations)) {
      permuted_group <- sample(group_labels)
      perm_stats[i] <- abs(mean(values[permuted_group == "Case"], na.rm = TRUE) -
                             mean(values[permuted_group == "Control"], na.rm = TRUE))
    }
    # Empirical p-value: proportion of permuted stats at least as extreme
    # as the observed one. NOTE(review): this estimator can return exactly
    # 0; the (k + 1) / (n + 1) form avoids that but would change the
    # reported values, so it is not applied here.
    p_value <- mean(perm_stats >= observed_stat)
    return(list(observed_stat = observed_stat, perm_stats = perm_stats, p_value = p_value))
  } else {
    warning(paste("Skipping variable", var, "due to insufficient group size"))
    return(NULL)
  }
}
# Accumulator for the per-variable observed statistics and p-values
p_value_results <- data.frame(Variable = character(), Observed_Stat = numeric(), p_value = numeric(), stringsAsFactors = FALSE)
# Run the permutation test for each burden variable and plot each
# null distribution with the observed statistic marked
for (var in coding_variables_to_analyze) {
test_result <- permutation_test(categorized_coding_stats_discovery, var, "Category_Group")
# NULL means the test was skipped for insufficient group size
if (!is.null(test_result)) {
perm_stats <- test_result$perm_stats
observed_stat <- test_result$observed_stat
p_value <- test_result$p_value
# Append this variable's result (small n, so rbind-in-loop is acceptable)
p_value_results <- rbind(p_value_results, data.frame(Variable = var,
Observed_Stat = observed_stat,
p_value = p_value))
# Data frame of the permuted statistics for plotting
perm_df <- data.frame(perm_stats = perm_stats)
# Histogram of the null distribution; the dashed red line marks the
# observed statistic. `linewidth` replaces the `size` aesthetic for
# lines, deprecated in ggplot2 3.4.0.
p <- ggplot(perm_df, aes(x = perm_stats)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
geom_vline(aes(xintercept = observed_stat), color = "red", linetype = "dashed", linewidth = 1) +
labs(title = paste("Permutation Test for", var),
x = "Permuted Statistic",
y = "Frequency",
subtitle = paste("Observed Statistic =", round(observed_stat, 4),
"| P-value =", format(p_value, digits = 4))) + # Add p-value to subtitle
theme_minimal()
print(p)
}
}
# Print the table of p-values
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
## Variable Observed_Stat p_value
## 1 Cardiomyopathy_HIGH 0.67487504 0.0001
## 2 Cardiomyopathy_start_lost 0.03608007 0.0007
## 3 Cardiomyopathy_stop_gained 0.27295893 0.0000
## 4 Cardiomyopathy_01_00001 2.19304371 0.0000
## 5 Cardiomyopathy_00001 0.55399147 0.0015
## 6 Cardiomyopathy_total_missense 8.02403705 0.0000
## 7 PLP_Cardiomyopathy 0.19359502 0.0000
## 8 Epilepsy_HIGH 0.70615260 0.0104
## 9 Epilepsy_start_lost 0.01129570 0.0509
## 10 Epilepsy_stop_gained 0.34350191 0.0000
## 11 Epilepsy_01_00001 1.45746349 0.0000
## 12 Epilepsy_00001 0.56937910 0.0045
## 13 Epilepsy_total_missense 2.48950064 0.0001
## 14 PLP_Epilepsy 0.01211653 0.5146
Create the Function to plot each column as a ratio to the means from Group 1
# Per-group summary of `col_name`, expressed as a ratio to the Control
# group mean, with SEM and SEM-based 95% CI bounds on that ratio.
#
# Args:
#   data:     data frame with a Category_Group column.
#   col_name: character, name of the numeric column to summarize.
#
# Returns list(summary_data = <per-group summary>) with mean_value,
# sd_value, n, ratio, sem_value, sem_ratio, ci_lower, ci_upper.
plot_column_ratio <- function(data, col_name) {
# Reference mean from the Control group — the denominator of every ratio.
# (The unused Control SD computed here previously has been removed.)
group1_mean <- mean(data[data$Category_Group == "Control", col_name], na.rm = TRUE)
# Per-group mean/SD/n, then scale everything to the Control mean
data_summary <- data %>%
group_by(Category_Group) %>%
summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
sd_value = sd(.data[[col_name]], na.rm = TRUE),
n = n()) %>%
mutate(ratio = mean_value / group1_mean,
sem_value = sd_value / sqrt(n),
sem_ratio = sem_value / group1_mean, # SEM on the same Control-relative scale as the ratio
ci_lower = ratio - (1.96 * sem_ratio), # Lower bound of the 95% CI
ci_upper = ratio + (1.96 * sem_ratio)) # Upper bound of the 95% CI
return(list(summary_data = data_summary))
}
Plot the data relative to control
# Accumulator for the per-column, per-group ratio summaries
combined_coding_stats_summary_df <- data.frame(Category_Group = character(), col_name = character(), mean = numeric(), std = numeric(), stringsAsFactors = FALSE)
# All burden columns except identifiers and the grouping column
columns_to_plot <- setdiff(names(categorized_coding_stats_discovery), c("ID", "Category", "Category_Group"))
# Summarize each column relative to the Control group mean
for (col in columns_to_plot) {
plot_data <- plot_column_ratio(categorized_coding_stats_discovery, col)
# Tag the summary rows with the column name and append them
combined_coding_stats_summary_df <- bind_rows(combined_coding_stats_summary_df, mutate(plot_data$summary_data, col_name = col))
}
# Keep only the burden variables of interest, and drop the Control rows
# (their ratio is 1 by construction)
selected_coding_data <- combined_coding_stats_summary_df %>%
filter(col_name %in% c(
"Cardiomyopathy_total_missense",
"Cardiomyopathy_01_00001",
"Cardiomyopathy_00001",
"Epilepsy_total_missense",
"Epilepsy_01_00001",
"Epilepsy_00001",
"PLP_Cardiomyopathy",
"PLP_Epilepsy",
"Cardiomyopathy_start_lost",
"Cardiomyopathy_stop_gained",
"Cardiomyopathy_HIGH",
"Epilepsy_start_lost",
"Epilepsy_HIGH",
"Epilepsy_stop_gained"
),
Category_Group != "Control")
# Display order on the y axis: Epilepsy variants first, then CMAR
# (cardiomyopathy) variants
levels_order <- c(
"PLP_Epilepsy",
"Epilepsy_00001",
"Epilepsy_01_00001",
"Epilepsy_total_missense",
"Epilepsy_start_lost",
"Epilepsy_stop_gained",
"Epilepsy_HIGH",
"PLP_Cardiomyopathy",
"Cardiomyopathy_00001",
"Cardiomyopathy_01_00001",
"Cardiomyopathy_total_missense",
"Cardiomyopathy_start_lost",
"Cardiomyopathy_stop_gained",
"Cardiomyopathy_HIGH"
)
selected_coding_data$col_name <- factor(selected_coding_data$col_name, levels = levels_order)
# Forest-style plot: per-variant ratio to the Control mean with +/- SEM bars.
# NOTE(review): points dodge with width = 1 but error bars with width = 0.5
# — with multiple non-Control groups these would misalign (harmless while
# only "Case" remains). The computed ci_lower/ci_upper are not used here;
# the bars span +/- 1 SEM. Confirm both choices are intentional.
coding_stats_plot <- ggplot(selected_coding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
geom_point(position = position_dodge(width = 1), size = 3) +
geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 0.5), width = 1) +
geom_vline(xintercept = 1, linetype = "dashed") +
scale_color_manual(values = "#3C8C78") +
labs(title = "Ratio Compared to Control +/-SEM",
y = "Variant",
x = "Ratio to Control Mean",
color = "Category Group") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8, hjust = 1),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
legend.title = element_text(size = 8),
legend.text = element_text(size = 8),
plot.title = element_text(size = 8, hjust = 0.5),
plot.subtitle = element_text(size = 8),
plot.caption = element_text(size = 8),
plot.margin = margin(15, 15, 15, 15)) +
# NOTE(review): scale limits (not coord limits) drop data outside [-2, 12]
scale_x_continuous(limits = c(-2, 12))
print(coding_stats_plot)
# Re-print the permutation-test p-value table alongside the plot
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
## Variable Observed_Stat p_value
## 1 Cardiomyopathy_HIGH 0.67487504 0.0001
## 2 Cardiomyopathy_start_lost 0.03608007 0.0007
## 3 Cardiomyopathy_stop_gained 0.27295893 0.0000
## 4 Cardiomyopathy_01_00001 2.19304371 0.0000
## 5 Cardiomyopathy_00001 0.55399147 0.0015
## 6 Cardiomyopathy_total_missense 8.02403705 0.0000
## 7 PLP_Cardiomyopathy 0.19359502 0.0000
## 8 Epilepsy_HIGH 0.70615260 0.0104
## 9 Epilepsy_start_lost 0.01129570 0.0509
## 10 Epilepsy_stop_gained 0.34350191 0.0000
## 11 Epilepsy_01_00001 1.45746349 0.0000
## 12 Epilepsy_00001 0.56937910 0.0045
## 13 Epilepsy_total_missense 2.48950064 0.0001
## 14 PLP_Epilepsy 0.01211653 0.5146
Input the data
# Per-individual variant counts by gene (NOTE(review): absolute path)
gene_data <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_variants_by_gene.csv")
# Cohort metadata, keyed by CGM_id
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort.txt", sep = "\t", header = TRUE)
# Attach each individual's arrest category via an ID lookup;
# unmatched IDs get NA
gene_data$Category <- cohort$arrest_ontology[match(gene_data$ID, cohort$CGM_id)]
gene_data_discovery <- gene_data %>%
filter(Set == "Discovery")
Summarize the data by GENE, counting the number of variants for each gene Filter for Cases and then group and summarize
# Per-gene HIGH-impact and PLP variant totals among cases
variants_per_gene_Case <- gene_data_discovery %>%
  filter(Category == "case") %>%
  group_by(GENE) %>%
  summarise(
    across(c(HIGH, PLP), ~ sum(.x, na.rm = TRUE)),
    .groups = 'drop'
  )
# Show the per-gene case counts
print(variants_per_gene_Case)
## # A tibble: 356 × 3
## GENE HIGH PLP
## <chr> <int> <int>
## 1 A2ML1 57 0
## 2 AARS 6 0
## 3 ABAT 0 0
## 4 ABCC9 4 0
## 5 ACADVL 4 0
## 6 ACTC1 1 0
## 7 ACTL6B 0 0
## 8 ACTN2 7 0
## 9 ADAM22 3 0
## 10 ADSL 1 1
## # ℹ 346 more rows
Filter for Controls and then group and summarize
# Per-gene HIGH-impact and PLP variant totals among controls
variants_per_gene_Control <- gene_data_discovery %>%
  filter(Category == "control") %>%
  group_by(GENE) %>%
  summarise(
    across(c(HIGH, PLP), ~ sum(.x, na.rm = TRUE)),
    .groups = 'drop'
  )
# Show the per-gene control counts
print(variants_per_gene_Control)
## # A tibble: 341 × 3
## GENE HIGH PLP
## <chr> <int> <int>
## 1 A2ML1 69 0
## 2 AARS 0 0
## 3 ABAT 0 0
## 4 ABCC9 3 0
## 5 ACADVL 6 3
## 6 ACTL6B 0 2
## 7 ACTN2 1 0
## 8 ADAM22 0 0
## 9 ADSL 0 0
## 10 AGL 0 0
## # ℹ 331 more rows
Load gene lists from text files
# Load the four source gene panels, one symbol per line.
# NOTE: some entries carry literal escaped quotes (e.g. "\"AARS1\"");
# these are cleaned up below before use.
genes_CMAR1 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR1_list.txt")
genes_CMAR2 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR2_list.txt")
genes_EIEE_OMIM <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_EIEE_OMIM_list.txt")
genes_Epilepsy <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_Epilepsy_list.txt")
Annotate gene with source panel
# Tag each panel's genes with their disease source
genes_CMAR1 <- data.frame(Gene = genes_CMAR1, Source = "Cardiomyopathy")
genes_CMAR2 <- data.frame(Gene = genes_CMAR2, Source = "Cardiomyopathy")
genes_EIEE_OMIM <- data.frame(Gene = genes_EIEE_OMIM, Source = "Epilepsy")
genes_Epilepsy <- data.frame(Gene = genes_Epilepsy, Source = "Epilepsy")
# Replace "\"AARS1\"" with "AARS"
genes_EIEE_OMIM$Gene <- ifelse(genes_EIEE_OMIM$Gene == "\"AARS1\"", "AARS", genes_EIEE_OMIM$Gene)
# Replace "\"NEXMIF\"" with "KIAA2022" (the original comment here had the
# direction inverted). Presumably this reverts to the legacy symbol used in
# the variant tables — TODO confirm against gene_data$GENE.
genes_Epilepsy$Gene <- ifelse(genes_Epilepsy$Gene == "\"NEXMIF\"", "KIAA2022", genes_Epilepsy$Gene)
Append panel source to the gene_data
# Combine all lists into one dataframe
all_genes <- rbind(genes_CMAR1, genes_CMAR2, genes_EIEE_OMIM, genes_Epilepsy)
# Sort by Gene and Source so that, for genes in both panels,
# "Cardiomyopathy" sorts before "Epilepsy" and wins the dedup below
all_genes <- all_genes[order(all_genes$Gene, all_genes$Source), ]
# Remove duplicates, keeping the first occurrence
all_genes <- all_genes[!duplicated(all_genes$Gene), ]
# Clean the 'Gene' column in 'all_genes' by removing quotes and backslashes.
# NOTE(review): quotes are stripped AFTER deduplication, so a quoted and an
# unquoted copy of the same symbol would both survive the dedup — confirm the
# input lists never contain both forms of one gene.
all_genes$Gene <- gsub('\"', '', all_genes$Gene)
# Ensure that the 'GENE' column in 'gene_data_discovery' and 'Gene' in 'all_genes' are character
gene_data_discovery$GENE <- as.character(gene_data_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)
# Look up each gene's panel Source; genes not in any panel get NA
gene_data_discovery$Source <- all_genes$Source[match(gene_data_discovery$GENE, all_genes$Gene)]
# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'
# View the first few rows to verify the new 'Source' has been added
head(gene_data_discovery)
## ID GENE HIGH PLP Set Category Source
## 1 CGM0000001 A2ML1 0 0 Discovery control Cardiomyopathy
## 2 CGM0000001 ABAT 0 0 Discovery control Epilepsy
## 3 CGM0000001 ADAM22 0 0 Discovery control Epilepsy
## 4 CGM0000001 AKAP9 0 0 Discovery control Cardiomyopathy
## 5 CGM0000001 ALDH5A1 0 0 Discovery control Epilepsy
## 6 CGM0000001 ALMS1 0 0 Discovery control Cardiomyopathy
Add the source to the control variants list
# Annotate the per-gene control counts with the panel Source (NA if the gene
# is in neither panel)
variants_per_gene_Control$Source <- all_genes$Source[match(variants_per_gene_Control$GENE, all_genes$Gene)]
head(variants_per_gene_Control)
## # A tibble: 6 × 4
## GENE HIGH PLP Source
## <chr> <int> <int> <chr>
## 1 A2ML1 69 0 Cardiomyopathy
## 2 AARS 0 0 Epilepsy
## 3 ABAT 0 0 Epilepsy
## 4 ABCC9 3 0 Cardiomyopathy
## 5 ACADVL 6 3 Cardiomyopathy
## 6 ACTL6B 0 2 Epilepsy
Add the source to the Case variants list
# Annotate the per-gene case counts with the panel Source (NA if the gene
# is in neither panel)
variants_per_gene_Case$Source <- all_genes$Source[match(variants_per_gene_Case$GENE, all_genes$Gene)]
head(variants_per_gene_Case)
## # A tibble: 6 × 4
## GENE HIGH PLP Source
## <chr> <int> <int> <chr>
## 1 A2ML1 57 0 Cardiomyopathy
## 2 AARS 6 0 Epilepsy
## 3 ABAT 0 0 Epilepsy
## 4 ABCC9 4 0 Cardiomyopathy
## 5 ACADVL 4 0 Cardiomyopathy
## 6 ACTC1 1 0 Cardiomyopathy
combine the variants together now
# Add a new column to indicate the category before stacking the two tables
variants_per_gene_Control <- variants_per_gene_Control %>%
mutate(Category = "Control")
variants_per_gene_Case <- variants_per_gene_Case %>%
mutate(Category = "Case")
# Combine the two datasets (rows: one per GENE x Category)
combined_variants <- bind_rows(variants_per_gene_Control, variants_per_gene_Case)
# Print the combined dataset
print(combined_variants)
## # A tibble: 697 × 5
## GENE HIGH PLP Source Category
## <chr> <int> <int> <chr> <chr>
## 1 A2ML1 69 0 Cardiomyopathy Control
## 2 AARS 0 0 Epilepsy Control
## 3 ABAT 0 0 Epilepsy Control
## 4 ABCC9 3 0 Cardiomyopathy Control
## 5 ACADVL 6 3 Cardiomyopathy Control
## 6 ACTL6B 0 2 Epilepsy Control
## 7 ACTN2 1 0 Cardiomyopathy Control
## 8 ADAM22 0 0 Epilepsy Control
## 9 ADSL 0 0 Epilepsy Control
## 10 AGL 0 0 Cardiomyopathy Control
## # ℹ 687 more rows
Plot the number of variant types in cases and controls
# Heatmap of HIGH-impact variant counts per gene, split by panel Source,
# comparing Case vs Control rows. Genes with zero HIGH counts are dropped.
combined_variants_High <- combined_variants %>% filter(HIGH > 0)
High_variant_plot <- ggplot(combined_variants_High, aes(x = GENE, y = Category, fill = HIGH)) +
geom_tile() +
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "High Variants Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside")
# Print the plot
print(High_variant_plot)
Plot the number of variant types in cases and controls
# Same heatmap as above, but for pathogenic/likely-pathogenic (PLP) counts.
# Genes with zero PLP counts are dropped.
combined_variants_PLP <- combined_variants %>% filter(PLP > 0)
# Create the PLP plot
PLP_plot <- ggplot(combined_variants_PLP, aes(x = GENE, y = Category, fill = PLP)) +
geom_tile() +
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "PLP Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside")
# Print the plot
print(PLP_plot)
Now bring in the module data
# Expert-curated module membership table (columns include Gene and Module)
modules <- read_excel("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/cardio_gene_sets.xlsx")
annotate the gene_data with the modules
# Attach the Module annotation to each gene row; genes without a module get NA
module_data <- left_join(gene_data_discovery, modules, by = c("GENE" = "Gene"))
# View the first few rows of the updated data
head(module_data)
## ID GENE HIGH PLP Set Category Source
## 1 CGM0000001 A2ML1 0 0 Discovery control Cardiomyopathy
## 2 CGM0000001 ABAT 0 0 Discovery control Epilepsy
## 3 CGM0000001 ADAM22 0 0 Discovery control Epilepsy
## 4 CGM0000001 AKAP9 0 0 Discovery control Cardiomyopathy
## 5 CGM0000001 ALDH5A1 0 0 Discovery control Epilepsy
## 6 CGM0000001 ALMS1 0 0 Discovery control Cardiomyopathy
## Module
## 1 Rasopathy
## 2 <NA>
## 3 <NA>
## 4 Membrane_polarity
## 5 <NA>
## 6 fate_specification
Now plot the PLPs by their expert-defined categories. The size is the relative number of variants per gene in the category; the color is the absolute number of variants.
# Filter NAs and aggregate the data
module_data <- module_data %>%
filter(!is.na(Module))
# Standardize the case/control grouping label.
# BUG FIX: Category holds the lowercase values "control"/"case" (it comes
# from cohort$arrest_ontology; see the filters and head() output above), so
# the original test `Category == "Control"` never matched and every row was
# labelled "case". Compare against the lowercase value instead.
module_data <- module_data %>%
mutate(Category_Group = ifelse(Category == "control", "control", "case"))
# Count up the PLP variants per module within each group
module_summary <- module_data %>%
group_by(Module, Category_Group) %>%
summarise(Total_PLP_variant = sum(PLP, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'Module'. You can override using the
## `.groups` argument.
# First, calculate the number of genes per module
genes_per_module <- modules %>%
group_by(Module) %>%
summarise(Num_Genes = n()) %>%
ungroup()
# Merge this information with your module_summary data frame
module_summary <- module_summary %>%
left_join(genes_per_module, by = c("Module" = "Module"))
# Size relative to module membership: PLP count divided by genes in module
module_summary <- module_summary %>%
mutate(Relative_Size = Total_PLP_variant / Num_Genes)
# Filter the data to only include "Case"
module_summary_filtered <- module_summary %>%
filter(Category_Group == "case")
# Order modules by mean PLP count (descending) for plotting
module_order <- module_summary_filtered %>%
group_by(Module) %>%
summarise(Sort_Metric = mean(Total_PLP_variant, na.rm = TRUE)) %>%
arrange(desc(Sort_Metric)) %>%
.$Module
module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)
# Dot plot: point size = absolute PLP count, color = count relative to module size
modules_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_PLP_variant, color = Relative_Size)) +
geom_point(shape = 15, alpha = 1) +
scale_size(range = c(3, 20), name = "Number of PLPs") +
scale_color_gradient(low = "Grey", high = "#05618F", name = "# of PLPs relative to Module size") +
theme_minimal() +
labs(title = "Total PLP by Module (Cases)",
x = "Module",
y = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(modules_plot)
Build the interaction plot data
# Count the High effect variants per individual (ID), module, and Category.
# na.rm = TRUE added for consistency with every other summarise() in this
# file; without it a single NA in HIGH would make the whole per-ID sum NA.
aggregated_data <- module_data %>%
group_by(ID, Module, Category) %>%
summarise(High_count = sum(HIGH, na.rm = TRUE),
.groups = 'drop')
#split off controls
data_control <- aggregated_data %>%
filter(Category %in% c("control"))
#identify IDs with any 'High_count' greater than 0
ids_with_both_1 <- data_control %>%
group_by(ID) %>%
summarise(High_count_any = any(High_count > 0)) %>%
filter(High_count_any) %>%
dplyr::select(ID)
# Keep only rows belonging to those IDs
modules_with_both_1 <- data_control %>%
semi_join(ids_with_both_1, by = "ID")
# For each ID, enumerate all unordered module pairs, then join the per-module
# High counts back on and keep pairs where at least one member has a HIGH
# variant. NOTE: combn(x, 2) errors if an ID has fewer than 2 modules —
# presumably every ID carries rows for multiple modules; TODO confirm.
module_pairs_for_ids_1 <- modules_with_both_1 %>%
group_by(ID) %>%
do({
data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
}) %>%
dplyr::rename(Module1 = X1, Module2 = X2) %>%
left_join(modules_with_both_1, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_1, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
filter(High_count_2 > 0 | High_count_1 > 0)
# Count distinct individuals carrying each module pair
module_pair_counts_1 <- module_pairs_for_ids_1 %>%
group_by(Module1, Module2) %>%
summarise(Count = n_distinct(ID), .groups = 'drop')
#split off cases (mirrors the control pipeline above)
data_case <- aggregated_data %>%
filter(Category %in% c("case"))
#identify IDs with any 'High_count' greater than 0
ids_with_both_case <- data_case %>%
group_by(ID) %>%
summarise(Missense_any = any(High_count > 0)) %>%
filter(Missense_any) %>%
dplyr::select(ID)
# Keep only rows belonging to those IDs
modules_with_both_case <- data_case %>%
semi_join(ids_with_both_case, by = "ID")
# Enumerate unordered module pairs per ID and keep pairs where at least one
# member module carries a HIGH variant for that individual
module_pairs_for_ids_case <- modules_with_both_case %>%
group_by(ID) %>%
do({
data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
}) %>%
dplyr::rename(Module1 = X1, Module2 = X2) %>%
left_join(modules_with_both_case, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_case, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
filter(High_count_2 > 0 | High_count_1 > 0)
# Count distinct individuals carrying each module pair
module_pair_counts_case <- module_pairs_for_ids_case %>%
group_by(Module1, Module2) %>%
summarise(Count = n_distinct(ID), .groups = 'drop')
# Ensure all combinations are present with at least a zero count for each category
comparison_df <- merge(module_pair_counts_1, module_pair_counts_case, by = c("Module1", "Module2"), all = TRUE)
comparison_df[is.na(comparison_df)] <- 0 # Replace NA with 0
# Rename the merge suffixes: .x came from controls, .y from cases
names(comparison_df)[names(comparison_df) == "Count.x"] <- "Count_control"
names(comparison_df)[names(comparison_df) == "Count.y"] <- "Count_case"
# Complements: cohort members NOT carrying the pair, per group.
# NOTE(review): the two sum() expressions use `==` vs `%in%` — equivalent
# here, but worth unifying for readability.
comparison_df$Not_Count_control <- sum(cohort$arrest_ontology == "control") - comparison_df$Count_control
comparison_df$Not_Count_case <- sum(cohort$arrest_ontology %in% c("case")) - comparison_df$Count_case
# Perform a 2x2 Fisher's exact test for one module pair.
#
# Args (all scalar counts; called positionally via mapply below):
#   count_control / not_count_control: controls with / without the pair
#   count_case    / not_count_case:    cases with / without the pair
# Returns: the two-sided Fisher's exact test p-value.
# (Argument casing made consistent; the original mixed Count_control with
# count_case. Callers pass positionally, so the rename is safe.)
perform_fisher_test <- function(count_control, not_count_control, count_case, not_count_case) {
  # matrix() fills column-wise: column 1 = controls, column 2 = cases
  contingency_table <- matrix(c(count_control, not_count_control, count_case, not_count_case),
                              nrow = 2,
                              dimnames = list(c("In Count", "Not in Count"), c("Group_control", "Group_case")))
  fisher_result <- fisher.test(contingency_table)
  fisher_result$p.value
}
# Row-wise Fisher's exact test: one p-value per module pair
comparison_df$p_value <- mapply(
  perform_fisher_test,
  comparison_df$Count_control, comparison_df$Not_Count_control,
  comparison_df$Count_case, comparison_df$Not_Count_case
)
# Bonferroni correction across all tested pairs. p.adjust() defaults n to
# length(p), so this matches the previous explicit n argument exactly.
comparison_df$adjusted_p_value <- p.adjust(comparison_df$p_value, method = "bonferroni")
# Keep and display only pairs that survive multiple-testing correction
significant_pairs <- comparison_df %>% filter(adjusted_p_value < 0.05)
print(significant_pairs)
## Module1 Module2 Count_control Count_case
## 1 CICR sarcomere 68 104
## 2 DGC fate_specification 5 28
## 3 DGC ICD 23 51
## 4 DGC Membrane_polarity 14 41
## 5 DGC Proteostasis 4 24
## 6 DGC sarcomere 27 68
## 7 fate_specification mitochondria 15 37
## 8 fate_specification nuclear_envelope 2 17
## 9 fate_specification sarcomere 28 73
## 10 ICD fate_specification 24 57
## 11 ICD Membrane_polarity 33 68
## 12 ICD Proteostasis 23 55
## 13 ICD sarcomere 45 95
## 14 lysosomal_storage sarcomere 25 61
## 15 Membrane_polarity fate_specification 15 46
## 16 Membrane_polarity lysosomal_storage 12 35
## 17 Membrane_polarity mitochondria 24 50
## 18 Membrane_polarity Proteostasis 14 43
## 19 Membrane_polarity sarcomere 37 83
## 20 mitochondria sarcomere 37 77
## 21 Proteostasis fate_specification 5 31
## 22 Proteostasis sarcomere 26 72
## 23 Rasopathy sarcomere 92 127
## Not_Count_control Not_Count_case p_value adjusted_p_value
## 1 528 419 9.184664e-05 8.358044e-03
## 2 591 495 7.980239e-06 7.262018e-04
## 3 573 472 9.256654e-05 8.423555e-03
## 4 582 482 2.248122e-05 2.045791e-03
## 5 592 499 2.376165e-05 2.162310e-03
## 6 569 455 4.739262e-07 4.312729e-05
## 7 581 486 3.201544e-04 2.913405e-02
## 8 594 506 2.162122e-04 1.967531e-02
## 9 568 450 6.060785e-08 5.515314e-06
## 10 572 466 1.366726e-05 1.243721e-03
## 11 563 455 1.500246e-05 1.365223e-03
## 12 573 468 1.660428e-05 1.510989e-03
## 13 551 428 1.046357e-07 9.521851e-06
## 14 571 462 2.895091e-06 2.634532e-04
## 15 581 477 4.241213e-06 3.859504e-04
## 16 584 488 1.330254e-04 1.210531e-02
## 17 572 473 2.536897e-04 2.308576e-02
## 18 582 480 8.210955e-06 7.471969e-04
## 19 559 440 1.988805e-07 1.809813e-05
## 20 559 446 2.651718e-06 2.413064e-04
## 21 591 492 1.281642e-06 1.166294e-04
## 22 570 451 3.284151e-08 2.988577e-06
## 23 504 396 2.121104e-04 1.930205e-02
Make the plot for module makeup
#merge the high effect count data (controls + cases, per ID x Module)
High_data <- rbind(data_control, data_case)
#assign in the module gene count data
# NOTE(review): these hard-coded counts are duplicated again further below
# for the network plot — keep the two copies in sync (or derive from
# `modules` as done elsewhere).
df_gene_counts <- data.frame(
Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis",
"Rasopathy", "SNS_PNS", "Z_disc", "cytokine",
"fate_specification", "lysosomal_storage", "mitochondria",
"nuclear_envelope", "sarcomere"),
Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)
# scale the high effect data per module based on the genes in the module
High_data_scaled <- High_data %>%
left_join(df_gene_counts, by = "Module")
High_data_scaled <- High_data_scaled %>%
mutate(High_count_per_gene = High_count / Num_Genes)
#compute the means (per Module x Category; SEM = SD / sqrt(N))
# NOTE(review): averages_sem_scaled is recomputed again below (without
# na.rm) just before plotting — this first computation is shadowed.
averages_sem_scaled <- High_data_scaled %>%
dplyr::group_by(Module, Category) %>%
dplyr::summarize(
Mean = mean(High_count_per_gene, na.rm = TRUE),
SD = sd(High_count_per_gene, na.rm = TRUE),
N = n(),
SEM = SD / sqrt(N),
.groups = 'drop'
)
# Per-module case-vs-control comparison; t.test() defaults to the Welch
# (unequal-variance) two-sample test. tidy() is broom::tidy.
test_results <- High_data_scaled %>%
group_by(Module) %>%
do({
ttest <- t.test(High_count_per_gene ~ Category, data = .)
tidy(ttest)
}) %>%
ungroup()
# show t-test data
print(test_results)
## # A tibble: 14 × 11
## Module estimate estimate1 estimate2 statistic p.value parameter conf.low
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CICR 6.86e-3 0.0185 0.0117 1.83 6.77e-2 712. -5.01e-4
## 2 DGC 3.59e-3 0.00439 0.000798 2.49 1.31e-2 637. 7.55e-4
## 3 ICD 7.94e-3 0.0135 0.00559 2.34 1.96e-2 702. 1.27e-3
## 4 Membrane_p… 6.76e-3 0.00777 0.00101 3.95 8.96e-5 483. 3.40e-3
## 5 Proteostas… 4.79e-3 0.00533 0.000532 3.01 2.75e-3 509. 1.66e-3
## 6 Rasopathy 3.10e-3 0.0111 0.00800 2.06 3.93e-2 897. 1.52e-4
## 7 SNS_PNS 2.33e-3 0.00696 0.00463 1.04 3.00e-1 621. -2.07e-3
## 8 Z_disc -1.02e-2 0.0478 0.0580 -1.39 1.65e-1 783. -2.45e-2
## 9 cytokine -2.88e-2 0.0866 0.115 -3.09 2.06e-3 984. -4.71e-2
## 10 fate_speci… 6.87e-3 0.00738 0.000508 3.41 7.18e-4 475. 2.90e-3
## 11 lysosomal_… 2.17e-4 0.00236 0.00214 0.118 9.06e-1 871. -3.38e-3
## 12 mitochondr… 1.84e-3 0.00281 0.000968 2.37 1.83e-2 596. 3.12e-4
## 13 nuclear_en… 3.13e-3 0.00313 0 2.01 4.53e-2 318 6.56e-5
## 14 sarcomere 1.99e-2 0.0230 0.00314 4.59 5.60e-6 475. 1.14e-2
## # ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
#calculate averages for plotting.
# NOTE(review): unlike the earlier computation this omits na.rm = TRUE;
# after the na.rm'd sum upstream that may be harmless, but confirm and
# unify the two versions.
averages_sem_scaled <- High_data_scaled %>%
dplyr::group_by(Module, Category) %>%
dplyr::summarize(
Mean = mean(High_count_per_gene),
SD = sd(High_count_per_gene),
N = n(),
SEM = SD / sqrt(N),
.groups = 'drop'
)
# Define fill colors and labels for the updated categories
custom_fill_colors <- c("control" = "#FF6464", "case" = "#05618F")
custom_fill_labels <- c("control" = "Control", "case" = "Case")
# Dodged bar chart of mean HIGH count per gene, with SEM error bars
High_modules_plot_scaled <- ggplot(averages_sem_scaled, aes(x = Module, y = Mean, fill = Category, group = Category)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8), color = "black") +
geom_errorbar(aes(ymin = Mean - SEM, ymax = Mean + SEM),
width = 0.2, position = position_dodge(0.8)) +
scale_fill_manual(values = custom_fill_colors, labels = custom_fill_labels) +
labs(x = "Module", y = "Average High Count Per Gene", fill = "Category") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(High_modules_plot_scaled)
Plot the interaction network
#compute the relative counts based on cohort size (case rate / control rate).
# NOTE(review): Count_control can be 0 after the NA->0 replacement above,
# which makes Relative_Counts Inf — confirm no significant pair has a zero
# control count before plotting edge widths.
comparison_df <- comparison_df %>%
mutate(Relative_Counts = (Count_case / (sum(cohort$arrest_ontology %in% c("case"))))/(Count_control / (sum(cohort$arrest_ontology == "control"))))
#assign in the module gene count data (duplicate of the table defined above)
df_gene_counts <- data.frame(
Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis",
"Rasopathy", "SNS_PNS", "Z_disc", "cytokine",
"fate_specification", "lysosomal_storage", "mitochondria",
"nuclear_envelope", "sarcomere"),
Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)
significant_edges_df <- comparison_df %>%
filter(adjusted_p_value < 0.05)
# Create an igraph object with edge attributes using filtered dataframe
network_graph <- graph_from_data_frame(d = significant_edges_df[, c("Module1", "Module2")], directed = FALSE)
# Matching Num_Genes from df_gene_counts to vertices in the network_graph
V(network_graph)$Num_Genes <- df_gene_counts$Num_Genes[match(V(network_graph)$name, df_gene_counts$Module)]
# Edge attributes assigned by position: this relies on graph_from_data_frame()
# keeping edges in the same row order as significant_edges_df
E(network_graph)$adjusted_p_value <- significant_edges_df$adjusted_p_value
E(network_graph)$Relative_Counts <- significant_edges_df$Relative_Counts
# Plot the graph with ggraph (force-directed 'fr' layout)
HIGH_network <- ggraph(network_graph, layout = 'fr') +
geom_edge_link(aes(color = -log10(adjusted_p_value), edge_width = Relative_Counts), alpha = 0.8) +
geom_node_point(aes(size = Num_Genes), color = "Black") +
geom_node_text(aes(label = name), repel = TRUE, point.padding = unit(0.01, "lines")) +
scale_size_continuous(range = c(3, 10)) +
theme_graph()
print(HIGH_network)
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Make the box plot for the module abundance
# Filter NAs and keep only rows with defined Modules
module_data <- module_data %>%
filter(!is.na(Module))
# Standardize Category labels (stored values are lowercase "control"/"case")
module_data <- module_data %>%
mutate(Category = ifelse(Category == "control", "Control", "Case"))
# Count up the HIGH variants per module within each group
module_summary <- module_data %>%
group_by(Module, Category) %>%
summarise(Total_HIGH_variant = sum(HIGH, na.rm = TRUE), .groups = "drop")
# Count number of genes per module
genes_per_module <- modules %>%
group_by(Module) %>%
summarise(Num_Genes = n(), .groups = "drop")
# Merge gene counts into summary
module_summary <- module_summary %>%
left_join(genes_per_module, by = "Module")
# Calculate HIGHs relative to module size
module_summary <- module_summary %>%
mutate(Relative_Size = Total_HIGH_variant / Num_Genes)
# Filter to only "Case" category
module_summary_filtered <- module_summary %>%
filter(Category == "Case")
# Order modules by mean variant count (descending)
module_order <- module_summary_filtered %>%
group_by(Module) %>%
summarise(Sort_Metric = mean(Total_HIGH_variant, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(Sort_Metric)) %>%
pull(Module)
# Apply module order
module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)
# Dot plot: size = absolute HIGH count, color = count relative to module size
modules_HIGH_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category, size = Total_HIGH_variant, color = Relative_Size)) +
geom_point(shape = 15, alpha = 1) +
scale_size(range = c(3, 20), name = "Number of HIGHs") +
scale_color_gradient(low = "Grey", high = "#05618F", name = "# of HIGHs relative to Module size") +
theme_minimal() +
labs(title = "Total HIGH by Module (Cases)",
x = "Module",
y = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(modules_HIGH_plot)
Pull in uniprot data
# Fetch protein feature annotations from UniProt via drawProteins.
# Requires network access; each get_features() call hits the UniProt API.
# pull down data for MYH7 (uniprot ID P12883)
MYH7_data <- drawProteins::get_features("P12883")
## [1] "Download has worked"
MYH7_data <- drawProteins::feature_to_dataframe(MYH7_data)
# pull down data for MYBPC3 (uniprot ID Q14896)
MYBPC3_data <- drawProteins::get_features("Q14896")
## [1] "Download has worked"
MYBPC3_data <- drawProteins::feature_to_dataframe(MYBPC3_data)
# pull down data for SCN5A (uniprot ID Q14524)
SCN5A_data <- drawProteins::get_features("Q14524")
## [1] "Download has worked"
SCN5A_data <- drawProteins::feature_to_dataframe(SCN5A_data)
Plot PLP variants
# Protein schematic for MYH7 with PLP variant positions overlaid as red dots
MYH7_plot <- draw_canvas(MYH7_data)
MYH7_plot <- draw_chains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_domains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_regions(MYH7_plot, MYH7_data)
#MYH7_plot <- draw_phospho(MYH7_plot, MYH7_data)
# Amino-acid positions of the PLP variants to mark on the chain (y = 1)
variants <- data.frame(
begin = c(1712,1057,908,904,869,741,723,576,369,355),
y = rep(1, 10)
)
# Now, add these points to your plot
MYH7_plot <- MYH7_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes and aesthetics as needed
MYH7_plot <- MYH7_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
MYH7_plot
##################################################################################################
# Protein schematic for MYBPC3 with PLP variant positions overlaid
MYBPC3_plot <- draw_canvas(MYBPC3_data)
MYBPC3_plot <- draw_chains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_domains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_regions(MYBPC3_plot, MYBPC3_data)
#MYBPC3_plot <- draw_phospho(MYBPC3_plot, MYBPC3_data)
####### Note: 7 variants are not shown because they are splice variants
variants <- data.frame(
begin = c(1098, 1096, 1064, 791, 542, 502, 495, 258, 219),
y = rep(1, 9)
)
# Now, add these points to your plot
MYBPC3_plot <- MYBPC3_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes and aesthetics as needed
MYBPC3_plot <- MYBPC3_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
MYBPC3_plot
##################################################################################################
# Protein schematic for SCN5A with PLP variant positions overlaid
SCN5A_plot <- draw_canvas(SCN5A_data)
SCN5A_plot <- draw_chains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_domains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_regions(SCN5A_plot, SCN5A_data)
#SCN5A_plot <- draw_phospho(SCN5A_plot, SCN5A_data)
####### Note: 2 variants are not shown because they are splice variants
variants <- data.frame(
begin = c(1784,1595,1319,1316,845,828,814),
y = rep(1, 7)
)
# Now, add these points to your plot
SCN5A_plot <- SCN5A_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes as needed
SCN5A_plot <- SCN5A_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
SCN5A_plot
Noncoding******************************************************************************************************************
Pull in the file that is just the ATAC data intervals overlaid on top of the Hi-C intervals which map to the genes of interest
# Read the PC-Hi-C ATAC overlay BED file (three unnamed columns:
# chromosome, start, end)
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/SFRN_cohort_data/noncoding/noncoding_rare/EP_ATAC_intersect_final_output_nodups.bed", col_names = FALSE, col_types = cols(
X1 = col_character(), # Chromosome
X2 = col_double(), # Start Position
X3 = col_double() # End Position
))
Plot noncoding interval size histogram
# Calculate Interval Sizes (bp) for each noncoding interval
bed_data$interval_size <- bed_data$X3 - bed_data$X2
# Calculate the median of interval sizes
median_interval_size <- median(bed_data$interval_size, na.rm = TRUE)
# Plot the interval-size distribution with the median marked.
# `size` is deprecated for line geoms since ggplot2 3.4.0 (this session runs
# 3.5.x per the warning elsewhere in the log) — use `linewidth` instead.
interval_size_histogram <- ggplot(bed_data, aes(x = interval_size)) +
geom_histogram(binwidth = 100, fill = "lightblue", color = "black") +
geom_vline(aes(xintercept = median_interval_size), color = "black", linetype = "dashed", linewidth = 1) +
xlab("Interval Size") +
ylab("Frequency") +
ggtitle("Histogram of Interval Sizes") +
theme_cowplot(12)
interval_size_histogram
Report the central tendency
# Central tendency of the noncoding interval sizes (bp)
mean(bed_data$interval_size)
## [1] 497.4085
median(bed_data$interval_size)
## [1] 396
Now pull in the ATAC data that maps to genes of interest, including the promoter fragment the ATAC data mapped to
# Read the gene-annotated BED file (ATAC interval + contacted promoter
# fragment + semicolon-separated gene names)
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/EP_ATAC_intersect_final_output_annotated_unique.bed", col_names = c("Chromosome", "Start", "End", "Chromosome2", "GeneStart", "GeneEnd", "GeneNames"), col_types = cols(
Chromosome = col_character(),
Start = col_double(),
End = col_double(),
Chromosome2 = col_character(),
GeneStart = col_double(),
GeneEnd = col_double(),
GeneNames = col_character()
))
# Split the gene names in case there are multiple genes separated by semicolon
# (one output row per interval-gene combination)
bed_data <- bed_data %>%
mutate(GeneNames = strsplit(GeneNames, ";")) %>%
unnest(GeneNames)
# Read the gene panel list
gene_list <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/panel_genes_simple.txt")
# Filter bed_data to keep only rows with genes in the gene list
bed_data <- bed_data %>%
filter(GeneNames %in% gene_list)
# Count the number of regions mapped to each gene
gene_count <- bed_data %>%
count(GeneNames)
print(gene_count)
## # A tibble: 347 × 2
## GeneNames n
## <chr> <int>
## 1 A2ML1 15
## 2 ABAT 45
## 3 ABCC9 35
## 4 ACADVL 3
## 5 ACTC1 9
## 6 ACTL6B 13
## 7 ACTN2 21
## 8 ADAM22 3
## 9 ADSL 35
## 10 AGL 59
## # ℹ 337 more rows
Plot histogram that is number of regions that map to the genes of interest
# Calculate the median number of regions per gene
median_regions <- median(gene_count$n, na.rm = TRUE)
# Plot the distribution of regions-per-gene with the median marked.
# `size` -> `linewidth`: deprecated for line geoms since ggplot2 3.4.0.
regions_per_gene <- ggplot(gene_count, aes(x = n)) +
geom_histogram(binwidth = 1, fill = "#B40F20", color = "#B40F20") +
geom_vline(aes(xintercept = median_regions), color = "black", linetype = "dashed", linewidth = 1) +
xlab("Number of Regions") +
ylab("Frequency") +
theme_cowplot(12) +
ggtitle("Histogram of the Distribution of Number of Regions per Gene")
regions_per_gene
Report the number of regions per gene
# Central tendency of regions mapped per gene
mean(gene_count$n)
## [1] 31.40922
median(gene_count$n)
## [1] 24
Report the region sizes
# Size (bp) of each individual noncoding region
bed_data$region_size <- bed_data$End - bed_data$Start
# Sum region sizes for each gene (total noncoding sequence space per gene)
total_region_size_per_gene <- bed_data %>%
group_by(GeneNames) %>%
summarise(TotalRegionSize = sum(region_size))
mean(total_region_size_per_gene$TotalRegionSize)
## [1] 16239.12
median(total_region_size_per_gene$TotalRegionSize)
## [1] 12010
Plot total sequence space per gene
# Create histogram of Total Region Sizes per gene (1 kb bins); the plot is
# rendered directly rather than stored in a variable
ggplot(total_region_size_per_gene, aes(x = TotalRegionSize)) +
geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
xlab("Total Region Size") +
ylab("Frequency") +
ggtitle("Histogram of Total Region Sizes per Gene")
Pull information about the TSS for each gene
# NOTE(review): this overwrites the panel gene_list read from file above
# with the genes actually present in bed_data — confirm that is intended.
gene_list <- unique(bed_data$GeneNames)
# Select the GRCh37/hg19 Ensembl archive (network access required)
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://grch37.ensembl.org")
# Get TSS positions along with strand information; one row per transcript
tss_info <- getBM(attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
filters = 'hgnc_symbol',
values = gene_list,
mart = ensembl)
find TSS distance
# Pick the most upstream TSS for one gene's transcripts: the smallest
# coordinate when every transcript is on the forward strand, otherwise the
# largest (reverse strand). Returns a one-row data.frame.
select_upstream_tss <- function(df) {
  forward_only <- all(df$strand == 1)
  chosen <- if (forward_only) {
    min(df$transcription_start_site)   # forward strand: upstream = leftmost
  } else {
    max(df$transcription_start_site)   # reverse strand: upstream = rightmost
  }
  data.frame(transcription_start_site = chosen)
}
# Apply the function to each group of genes (one upstream TSS per symbol)
tss_upstream <- tss_info %>%
group_by(hgnc_symbol) %>%
do(select_upstream_tss(.))
# Merge bed_data with tss_upstream on gene names; intervals for genes with
# no TSS record are dropped by the inner merge
merged_data <- merge(bed_data, tss_upstream, by.x = "GeneNames", by.y = "hgnc_symbol")
# Calculate the midpoint (peak center) of each interval
merged_data$Center <- (merged_data$Start + merged_data$End) / 2
# Absolute distance from the peak center to the contacted gene's TSS
merged_data$DistanceToTSS <- abs(merged_data$Center - merged_data$transcription_start_site)
Distance to TSS
# Central tendency of peak-center-to-TSS distances (bp)
mean(merged_data$DistanceToTSS)
## [1] 264213.1
median(merged_data$DistanceToTSS)
## [1] 176294
Plot distance to TSS histogram
# Histogram of distances to the contacted gene's TSS on a log10 x-axis.
# With scale_x_log10, binwidth = 0.1 is in log10 units.
median_distance <- median(merged_data$DistanceToTSS, na.rm = TRUE)
# `size` -> `linewidth`: deprecated for line geoms since ggplot2 3.4.0.
dist_tss <- ggplot(merged_data, aes(x = DistanceToTSS)) +
geom_histogram(binwidth = 0.1, fill = "#FFB3BA", color = "black") +
geom_vline(aes(xintercept = median_distance), color = "black", linetype = "dashed", linewidth = 1) +
xlab("Distance to TSS (bp)") +
ylab("Frequency") +
theme_cowplot(12) +
ggtitle("Histogram of Distances to Transcription Start Sites (TSS)") +
scale_x_log10(breaks = 10^(0:6) * 10000, labels = scales::comma)
dist_tss
Pull all TSSs
# Retrieve TSS information for ALL genes (no filter) along with HGNC symbols;
# used below to find the closest TSS to each peak genome-wide
all_tss_info <- getBM(
attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
mart = ensembl
)
# Filter out rows where hgnc_symbol is blank (transcripts without a symbol)
filtered_all_tss_info <- all_tss_info %>%
filter(hgnc_symbol != "")
See if the peak is closer to another TSS than the contacted one!
# Most upstream TSS for one gene, carrying along the chromosome of the
# transcript that supplies it (same strand rule as the earlier helper, but
# robust to empty input: an empty group yields a one-row all-NA frame).
select_upstream_tss <- function(df) {
  if (nrow(df) == 0) {
    return(data.frame(chromosome_name = NA, transcription_start_site = NA))
  }
  tss <- if (all(df$strand == 1)) {
    min(df$transcription_start_site, na.rm = TRUE)  # forward strand
  } else {
    max(df$transcription_start_site, na.rm = TRUE)  # reverse (or mixed) strand
  }
  # Chromosome of the row whose TSS matches the chosen coordinate
  row_idx <- which.min(abs(df$transcription_start_site - tss))
  data.frame(chromosome_name = df$chromosome_name[row_idx], transcription_start_site = tss)
}
# Apply the function to each group of genes
all_tss_upstream <- filtered_all_tss_info %>%
group_by(hgnc_symbol) %>%
do(select_upstream_tss(.)) %>%
ungroup()
# Prepend 'chr' to chromosome names
# (presumably to match the 'chr'-prefixed names used in merged_data$Chromosome — verify)
all_tss_upstream$chromosome_name <- paste0("chr", all_tss_upstream$chromosome_name)
Find the closest transcription start site (TSS) to each peak center and calculate the distance.
# Create GRanges object for all_tss_upstream (each TSS as a 1-bp range)
tss_ranges <- GRanges(
seqnames = all_tss_upstream$chromosome_name,
ranges = IRanges(start = all_tss_upstream$transcription_start_site, end = all_tss_upstream$transcription_start_site)
)
# Create GRanges object for merged_data (each peak center as a 1-bp range)
merged_data_ranges <- GRanges(
seqnames = merged_data$Chromosome,
ranges = IRanges(start = merged_data$Center, end = merged_data$Center)
)
# Find the nearest TSS for each midpoint (peak center)
# NOTE(review): nearest() returns an index per query; it can be NA when a
# query's chromosome has no TSS entry — confirm downstream NA handling
nearest_hits <- nearest(merged_data_ranges, tss_ranges)
# Extract the closest TSS information (row-index into all_tss_upstream)
closest_tss_info <- all_tss_upstream[nearest_hits, ]
# Add closest TSS information to merged_data (row orders align by construction)
merged_data$closest_gene <- closest_tss_info$hgnc_symbol
merged_data$closest_tss <- closest_tss_info$transcription_start_site
merged_data$closest_tss_chromosome <- closest_tss_info$chromosome_name
# Calculate distance to closest TSS
merged_data$distance_to_closest_tss <- abs(merged_data$Center - merged_data$closest_tss)
Check if the closest TSS gene matches the originally contacted gene
# Compare GeneNames with closest_gene and append
# (TRUE when the contacted gene is also the nearest-TSS gene; NA propagates)
merged_data$gene_match <- merged_data$GeneNames == merged_data$closest_gene
Compute and print the percentage of matches and mismatches between the contacted gene and the closest TSS gene.
# Calculate the fraction of matches
# na.rm in the numerator only: NA matches count toward the denominator
fraction_match <- sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data)
fraction_mismatch <- 1-(sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data))
# Print the fraction (as percentages)
print(100*fraction_match)
## [1] 13.26727
print(100*fraction_mismatch)
## [1] 86.73273
Let's see which genes have mapped regions that are closer to another gene's TSS
# Aggregate data by GeneNames: per-gene flags over its peaks' gene_match values
gene_summary <- merged_data %>%
dplyr::group_by(GeneNames) %>%
dplyr::summarize(
any_true = any(gene_match, na.rm = TRUE),
any_false = any(!gene_match, na.rm = TRUE),
all_true = all(gene_match, na.rm = TRUE),
all_false = all(!gene_match, na.rm = TRUE)
) %>%
ungroup()
# Adjusted calculations to include the 16 panel genes with no ATAC peaks
# (they do not appear in merged_data, so they are added to the denominator)
adjusted_denominator <- nrow(gene_summary) + 16
percent_any_true <- sum(gene_summary$any_true) / adjusted_denominator * 100
number_any_true <- sum(gene_summary$any_true)
percent_any_false <- sum(gene_summary$any_false) / adjusted_denominator * 100
number_any_false <- sum(gene_summary$any_false)
percent_all_true <- sum(gene_summary$all_true) / adjusted_denominator * 100
number_all_true <- sum(gene_summary$all_true)
percent_all_false <- sum(gene_summary$all_false) / adjusted_denominator * 100
number_all_false <- sum(gene_summary$all_false)
percent_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) / adjusted_denominator * 100
number_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false)
# Print the results
# NOTE(review): panel size 363 and the 16 missing genes are hard-coded — verify against the panel gene list
cat("Total genes in panel", 363 ,"\n")
## Total genes in panel 363
cat("Genes with no ATAC peaks:", 100*(16/363),"%, ",16,"\n")
## Genes with no ATAC peaks: 4.407713 %, 16
cat("Genes with at least one TRUE:", percent_any_true,"%, ", number_any_true,"\n")
## Genes with at least one TRUE: 42.69972 %, 155
cat("Genes with at least one FALSE:", percent_any_false,"%, ", number_any_false,"\n")
## Genes with at least one FALSE: 93.66391 %, 340
cat("Genes with only TRUE:", percent_all_true,"%, ", number_all_true,"\n")
## Genes with only TRUE: 1.928375 %, 7
cat("Genes with only FALSE:", percent_all_false,"%, ", number_all_false,"\n")
## Genes with only FALSE: 52.89256 %, 192
cat("Genes with both TRUE and FALSE:", percent_both_true_false,"%, ", number_both_true_false,"\n")
## Genes with both TRUE and FALSE: 40.77135 %, 148
Input the count data
# Read per-individual noncoding variant frequency counts (tab-delimited)
noncoding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/noncoding_frequency_by_individual.txt", header = TRUE, sep = "\t")
Categorize the data
# Map raw category labels to display labels. There is deliberately no
# catch-all: any other Category becomes NA and is dropped by na.omit() later.
categorized_noncoding_stats <- noncoding_stats %>%
  mutate(Category_Group = case_when(
    Category == "control" ~ "Control",
    Category == "case" ~ "Case"  # '==' for consistency with the line above (was `%in%`)
  ))
Aggregate and clean
# Clean up: drop identifier columns, then remove rows containing NA or Inf.
# if_all(everything(), ...) replaces the superseded filter_all(all_vars(...))
# (the file already uses dplyr >= 1.0 idioms such as across(where(...))).
categorized_noncoding_stats <- categorized_noncoding_stats %>%
  dplyr::select(-ID, -Category) %>%
  na.omit() %>%
  dplyr::filter(dplyr::if_all(dplyr::everything(), ~ !is.infinite(.)))
# Aggregate the Data to compute mean and standard deviation for each numeric column
# (produces <col>_mean and <col>_std columns per Category_Group)
collapsed_noncoding_stats <- categorized_noncoding_stats %>%
group_by(Category_Group) %>%
summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE),
std = ~sd(., na.rm = TRUE))))
# Remove rows with NA values (e.g. an NA Category_Group)
collapsed_noncoding_stats <- na.omit(collapsed_noncoding_stats)
Perform permutation test
# List of variables to analyze (count columns tested case vs. control below)
noncoding_variables_to_analyze <- c("variant_count", "Epilepsy_gnomAD.0.001","Epilepsy_ultrarare","Cardiomyopathy_gnomAD.0.001","Cardiomyopathy_ultrarare")
# Define the permutation function
# Permutation test on the absolute difference in group means.
#
# Args:
#   data: data frame containing `var` (numeric) and `group_var`
#         (labels "Case" / "Control")
#   var: name of the numeric column to test (string)
#   group_var: name of the grouping column (string)
#   num_permutations: number of label shuffles (default 10000)
# Returns:
#   list(observed_stat, perm_stats, p_value), where p_value is the
#   proportion of permuted |mean difference| values >= the observed one.
permutation_test <- function(data, var, group_var, num_permutations = 10000) {
  values <- data[[var]]
  labels <- data[[group_var]]
  # Observed |mean(Case) - mean(Control)|
  observed_stat <- abs(mean(values[labels == "Case"]) -
                         mean(values[labels == "Control"]))
  # One shuffled statistic per permutation; vapply preallocates the result
  # and calls sample() once per iteration, matching the original RNG stream.
  perm_stats <- vapply(seq_len(num_permutations), function(perm) {
    shuffled <- sample(labels)
    abs(mean(values[shuffled == "Case"]) -
          mean(values[shuffled == "Control"]))
  }, numeric(1))
  # P-value: fraction of permuted statistics at least as extreme as observed
  p_value <- mean(perm_stats >= observed_stat)
  list(observed_stat = observed_stat, perm_stats = perm_stats, p_value = p_value)
}
# Initialize a dataframe to store p-values for each variable
p_value_results <- data.frame(Variable = character(), Observed_Stat = numeric(), p_value = numeric(), stringsAsFactors = FALSE)
# Loop through each variable and perform the permutation test
# (rbind-in-loop is fine here: only 5 variables)
for (var in noncoding_variables_to_analyze) {
# Run the permutation test
test_result <- permutation_test(categorized_noncoding_stats, var, "Category_Group")
# Extract the permuted statistics, observed statistic, and p-value
perm_stats <- test_result$perm_stats
observed_stat <- test_result$observed_stat
p_value <- test_result$p_value
# Store the p-value and observed statistic in the dataframe
p_value_results <- rbind(p_value_results, data.frame(Variable = var,
Observed_Stat = observed_stat,
p_value = p_value))
# Create a data frame for plotting the permutation distribution
perm_df <- data.frame(perm_stats = perm_stats)
# Plot the permutation distribution with the observed statistic marked and p-value
p <- ggplot(perm_df, aes(x = perm_stats)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
geom_vline(aes(xintercept = observed_stat), color = "red", linetype = "dashed", size = 1) +
labs(title = paste("Permutation Test for", var),
x = "Permuted Statistic",
y = "Frequency",
subtitle = paste("Observed Statistic =", round(observed_stat, 4),
"| P-value =", format(p_value, digits = 4))) + # Add p-value to subtitle
theme_minimal()
# Print the plot
print(p)
}
# Print the table of p-values
# (p_value = 0 means no permutation reached the observed statistic, i.e. p < 1/num_permutations)
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
## Variable Observed_Stat p_value
## 1 variant_count 133.718673 0.0000
## 2 Epilepsy_gnomAD.0.001 1.972116 0.0108
## 3 Epilepsy_ultrarare 9.608424 0.0000
## 4 Cardiomyopathy_gnomAD.0.001 5.523130 0.0000
## 5 Cardiomyopathy_ultrarare 7.756665 0.0000
Define a function to compute the ratio of each column’s mean relative to the Control group and calculate confidence intervals
# Summarize one column as the ratio of each group's mean to the Control
# ("Group 1") mean, with a normal-approximation 95% CI from the SEM.
#
# Args:
#   data: data frame with Category_Group and the target numeric column
#   col_name: name of the column to summarize (string)
# Returns:
#   list(summary_data = per-group tibble with mean_value, sd_value, n,
#   ratio, sem_value, sem_ratio, ci_lower, ci_upper)
plot_column_ratio <- function(data, col_name) {
  # Reference mean: the Control group
  # (removed the unused group1_sd computation from the original)
  group1_mean <- mean(data[data$Category_Group == "Control", col_name], na.rm = TRUE)
  # Per-group mean/sd/n, then ratios and CI bounds relative to Control
  data_summary <- data %>%
    group_by(Category_Group) %>%
    summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
              sd_value = sd(.data[[col_name]], na.rm = TRUE),
              n = n()) %>%
    mutate(ratio = mean_value / group1_mean,
           sem_value = sd_value / sqrt(n), # SEM of the group mean
           sem_ratio = sem_value / group1_mean, # SEM on the ratio scale
           ci_lower = ratio - (1.96 * sem_ratio), # Lower bound of the 95% CI
           ci_upper = ratio + (1.96 * sem_ratio)) # Upper bound of the 95% CI
  # Returned as a list for backward compatibility with existing callers
  return(list(summary_data = data_summary))
}
Iterate over selected columns, compute summary statistics, and store results in a dataframe.
# List of all columns to plot (every count column except identifiers)
columns_to_plot <- setdiff(names(categorized_noncoding_stats), c("ID", "Category", "Category_Group"))
# Start from an empty data frame: the previous seed frame declared `mean`/`std`
# columns that no summary ever filled, leaving spurious all-NA columns after bind_rows.
combined_noncoding_stats_summary_df <- data.frame()
# Loop through each column and summarize it relative to the Control group
for (col in columns_to_plot) {
  plot_data <- plot_column_ratio(categorized_noncoding_stats, col)
  # Append this column's per-group summary, tagged with the column name
  combined_noncoding_stats_summary_df <- bind_rows(combined_noncoding_stats_summary_df, mutate(plot_data$summary_data, col_name = col))
}
Filter and factorize selected noncoding variants, then generate a plot showing their mean ratios compared to the Control group
# Subset the data for the specified variables (cases only; Control is the
# reference at ratio = 1, drawn as the dashed vertical line)
selected_noncoding_data <- combined_noncoding_stats_summary_df %>%
  filter(col_name %in% c(
    "Cardiomyopathy_ultrarare",
    "Epilepsy_ultrarare"),
    Category_Group != "Control")
# Define the specific order for noncoding variants
levels_order <- c(
  "Epilepsy_ultrarare",
  "Cardiomyopathy_ultrarare"
)
# Factorize the 'col_name' column with the specified order
selected_noncoding_data$col_name <- factor(selected_noncoding_data$col_name, levels = levels_order)
# Plot: point = mean ratio to Control, error bar = +/- SEM.
# FIX: points and error bars now share the same dodge width (0.5); the
# original dodged points by 1 and bars by 0.5, which would misalign them
# whenever more than one Category_Group is plotted.
noncoding_stats_plot <- ggplot(selected_noncoding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 0.5), width = 0.5) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  scale_color_manual(values = "#FF6464") +
  labs(title = "Ratio Compared to Group 1 +/-SEM",
       y = "Variant",
       x = "Ratio to Group 1 Mean",
       color = "Category Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8, hjust = 1),
        axis.title.x = element_text(size = 8),
        axis.title.y = element_text(size = 8),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 8),
        plot.title = element_text(size = 8, hjust = 0.5),
        plot.subtitle = element_text(size = 8),
        plot.caption = element_text(size = 8),
        plot.margin = margin(15, 15, 15, 15)) +
  scale_x_continuous(limits = c(0.75, 2))
print(noncoding_stats_plot)
Input the data (change to your path)
# Pull in the gene data (per-individual, per-gene noncoding variant counts)
gene_data_noncoding <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_noncoding_variants_by_gene.csv")
# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort.txt", sep = "\t", header = TRUE)
# add the arrest category to each
# match() preserves gene_data row order; IDs missing from the cohort become NA
gene_data_noncoding$Category <- cohort$arrest_ontology[match(gene_data_noncoding$ID, cohort$CGM_id)]
#filter for discovery only
gene_data_noncoding_discovery <- gene_data_noncoding %>%
filter(Set == "Discovery")
Summarize the data by GENE, counting the number of variants for each gene. Filter for Category == case and then group and summarize
# Per-gene total of ultrarare noncoding variants across all cases
variants_per_gene_case <- gene_data_noncoding_discovery %>%
filter(Category == "case") %>%
group_by(GENE) %>%
summarise(
noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_case)
## # A tibble: 424 × 2
## GENE noncoding_ultrarare
## <chr> <int>
## 1 A2ML1 95
## 2 AAMP 10
## 3 ABAT 64
## 4 ABCC9 0
## 5 AC011551.3 0
## 6 ACADVL 27
## 7 ACTC1 31
## 8 ACTL6B 172
## 9 ACTN2 28
## 10 ADAM22 0
## # ℹ 414 more rows
Filter for Category == control and then group and summarize
# Per-gene total of ultrarare noncoding variants across all controls
variants_per_gene_control <- gene_data_noncoding_discovery %>%
filter(Category == "control") %>%
group_by(GENE) %>%
summarise(
noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_control)
## # A tibble: 424 × 2
## GENE noncoding_ultrarare
## <chr> <int>
## 1 A2ML1 37
## 2 AAMP 1
## 3 ABAT 50
## 4 ABCC9 0
## 5 AC011551.3 0
## 6 ACADVL 9
## 7 ACTC1 7
## 8 ACTL6B 130
## 9 ACTN2 16
## 10 ADAM22 0
## # ℹ 414 more rows
Append panel source to the gene_data
# Ensure that the 'GENE' column in 'gene_data_noncoding_discovery' and 'Gene' in 'all_genes' are character type
# NOTE(review): `all_genes` (panel gene list with Gene/Source columns) is defined
# earlier in the file, outside this chunk — verify its columns there
gene_data_noncoding_discovery$GENE <- as.character(gene_data_noncoding_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)
# Find the index of each gene in 'all_genes'
# Then, use this index to assign the corresponding 'Source' to a new column in 'gene_data'
# (genes not in the panel list get Source = NA)
gene_data_noncoding_discovery$Source <- all_genes$Source[match(gene_data_noncoding_discovery$GENE, all_genes$Gene)]
# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'
# View the first few rows to verify the new 'Source' has been added
head(gene_data_noncoding_discovery)
## ID GENE noncoding_ultrarare Set X X.1 Category
## 1 CGM0000969 RANGRF 0 Discovery NA NA case
## 2 CGM0000969 AGL 0 Discovery NA NA case
## 3 CGM0000969 RNF13 0 Discovery NA NA case
## 4 CGM0000969 PRKAG2 0 Discovery NA NA case
## 5 CGM0000969 JPH2 1 Discovery NA NA case
## 6 CGM0000969 RP11-156P1.2 0 Discovery NA NA case
## Source
## 1 Cardiomyopathy
## 2 Cardiomyopathy
## 3 Epilepsy
## 4 Cardiomyopathy
## 5 Cardiomyopathy
## 6 <NA>
Add the source to the category 1 variants list
# Same panel-source lookup for the per-gene control counts
variants_per_gene_control$Source <- all_genes$Source[match(variants_per_gene_control$GENE, all_genes$Gene)]
head(variants_per_gene_control)
## # A tibble: 6 × 3
## GENE noncoding_ultrarare Source
## <chr> <int> <chr>
## 1 A2ML1 37 Cardiomyopathy
## 2 AAMP 1 <NA>
## 3 ABAT 50 Epilepsy
## 4 ABCC9 0 Cardiomyopathy
## 5 AC011551.3 0 <NA>
## 6 ACADVL 9 Cardiomyopathy
Add the source to the case variants list
# Same panel-source lookup for the per-gene case counts
variants_per_gene_case$Source <- all_genes$Source[match(variants_per_gene_case$GENE, all_genes$Gene)]
head(variants_per_gene_case)
## # A tibble: 6 × 3
## GENE noncoding_ultrarare Source
## <chr> <int> <chr>
## 1 A2ML1 95 Cardiomyopathy
## 2 AAMP 10 <NA>
## 3 ABAT 64 Epilepsy
## 4 ABCC9 0 Cardiomyopathy
## 5 AC011551.3 0 <NA>
## 6 ACADVL 27 Cardiomyopathy
Combine the variants together now
# Add a new column to indicate the category
variants_per_gene_control <- variants_per_gene_control %>%
mutate(Category = "control")
# Add a new column to indicate the category
variants_per_gene_case <- variants_per_gene_case %>%
mutate(Category = "case")
# Combine the two datasets (stacked long: one row per gene per category)
combined_variants <- bind_rows(variants_per_gene_control, variants_per_gene_case)
# Print the combined dataset
print(combined_variants)
## # A tibble: 848 × 4
## GENE noncoding_ultrarare Source Category
## <chr> <int> <chr> <chr>
## 1 A2ML1 37 Cardiomyopathy control
## 2 AAMP 1 <NA> control
## 3 ABAT 50 Epilepsy control
## 4 ABCC9 0 Cardiomyopathy control
## 5 AC011551.3 0 <NA> control
## 6 ACADVL 9 Cardiomyopathy control
## 7 ACTC1 7 Cardiomyopathy control
## 8 ACTL6B 130 Epilepsy control
## 9 ACTN2 16 Cardiomyopathy control
## 10 ADAM22 0 Epilepsy control
## # ℹ 838 more rows
Plot the number of noncoding variant types for the genes
# Filter dataset where noncoding_ultrarare > 0
# (also drop genes with no panel Source assignment, since the plot facets by Source)
noncoding_ultrarare_data <- combined_variants %>%
filter(noncoding_ultrarare > 0) %>%
filter(!is.na(Source))
# Label function for the noncoding_ultrarare plot's Category axis.
# BUG FIX: the original compared `value == "1"` (a holdover from numeric
# category codes), which never matched the actual values "control"/"case"
# (set when combined_variants was built), so BOTH rows were labeled "Case".
custom_labeller_noncoding <- function(variable, value) {
  if (variable == "Category") {
    return(ifelse(value == "control", "Control", "Case"))
  }
  return(value)
}
# Create the noncoding_ultrarare plot: gene x category heatmap of counts,
# faceted by panel Source; y-axis labels go through custom_labeller_noncoding
noncoding_ultrarare_plot <- ggplot(noncoding_ultrarare_data, aes(x = GENE, y = Category, fill = noncoding_ultrarare)) +
geom_tile() +
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "Noncoding Ultrarare Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside") +
scale_y_discrete(labels = function(x) custom_labeller_noncoding("Category", x))
# Print the plot
print(noncoding_ultrarare_plot)
Extract pLI constraint scores from gnomAD, compute enrichment stats for noncoding ultrarare variants, and plot their significance relative to pLI
# gnomAD_data pLI constraint data (loaded as a data.table via fread)
gnomad_data <- fread("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/gnomad.v2.1.1.lof_metrics.by_gene.txt")
# Function to fetch pLI
# Looks up one gene symbol in the gnomAD constraint table (data.table syntax);
# stops when absent — callers wrap this in try() to skip missing genes.
get_pLI <- function(gene_symbol, gnomad_data) {
result <- gnomad_data[gene == gene_symbol, .(gene, pLI)]
if (nrow(result) == 0) {
stop("Gene not found in gnomAD data.")
}
return(result)
}
# Initialize an empty data frame to store the results
final_merged_data <- data.frame(
GENE = character(),
p_value = numeric(),
pLI = numeric(),
Source = character(),
ratio = numeric(),
stringsAsFactors = FALSE
)
# Process each panel "Source" group separately
for (source in unique(noncoding_ultrarare_data$Source)) {
# Filter data by panel "Source"
source_data <- noncoding_ultrarare_data %>% filter(Source == source)
# Extract unique genes for this Source
unique_genes <- unique(source_data$GENE)
# Create a data frame to store pLI values
pli_values <- data.frame(GENE = character(), pLI = numeric(), stringsAsFactors = FALSE)
# Fetch pLI values for each gene; try() skips genes missing from gnomAD
for (gene in unique_genes) {
try({
pli_value <- get_pLI(gene, gnomad_data)
pli_values <- rbind(pli_values, data.frame(GENE = gene, pLI = pli_value$pLI, stringsAsFactors = FALSE))
}, silent = TRUE)
}
# Compute p-values and ratios for each gene.
# NOTE(review): 537 and 506 appear to be the control and case cohort sizes
# (consistent with the per-cohort rates in `ratio` below) — confirm and
# consider defining them once as named constants.
p_values <- source_data %>%
group_by(GENE) %>%
summarise(
p_value = {
# Chi-square goodness-of-fit (df = 1) of case/control counts against
# expectations proportional to cohort sizes
control_count <- sum(noncoding_ultrarare[Category == "control"], na.rm = TRUE)
case_count <- sum(noncoding_ultrarare[Category == "case"], na.rm = TRUE)
total_count <- sum(noncoding_ultrarare, na.rm = TRUE)
expected_control <- total_count * (537 / (537 + 506))
expected_case <- total_count * (506 / (537 + 506))
chisq_stat <- ((control_count - expected_control)^2 / expected_control) +
((case_count - expected_case)^2 / expected_case)
pchisq(chisq_stat, df = 1, lower.tail = FALSE)
},
ratio = {
# Per-individual rate ratio, case over control; NA when control count is 0
control_count <- sum(noncoding_ultrarare[Category == "control"], na.rm = TRUE)
case_count <- sum(noncoding_ultrarare[Category == "case"], na.rm = TRUE)
if (is.na(control_count) | is.na(case_count) | control_count == 0) {
NA
} else {
(case_count / 506) / (control_count / 537)
}
},
.groups = "drop"
)
# Merge with pLI values (genes without a pLI entry are dropped by merge)
merged_data <- merge(p_values, pli_values, by = "GENE")
merged_data$Source <- source
# Append to the final data frame
final_merged_data <- rbind(final_merged_data, merged_data)
}
# Bonferroni correction across all genes tested (both panels combined)
num_tests <- nrow(final_merged_data)
bonferroni_cutoff <- 0.05 / num_tests
# Filter for significant results
bonferroni_filtered_data <- final_merged_data %>%
filter(p_value < bonferroni_cutoff)
# Get top 12 genes by p-value per source
# (object keeps its legacy name `top10_labels`; downstream plotting code references it)
top10_labels <- bonferroni_filtered_data %>%
group_by(Source) %>%
slice_min(order_by = p_value, n = 12) %>%
ungroup()
# Filter for Cardiomyopathy genes
cardio_data <- bonferroni_filtered_data %>%
filter(Source == "Cardiomyopathy")
# Plot: -log10(p) vs pLI per gene, colored by case/control rate ratio,
# labeling the top (lowest-p) Cardiomyopathy genes
reg_variant_plot <- ggplot(cardio_data, aes(y = -log10(p_value), x = pLI, color = ratio)) +
geom_point() +
geom_text(
data = top10_labels %>% filter(Source == "Cardiomyopathy"),
aes(label = GENE),
hjust = 0.5, vjust = -0.5
) +
scale_color_gradient(low = "#3C8C78", high = "#dcb43c", na.value = "grey") +
labs(
title = paste(
"P-Values of Noncoding Ultrarare Variants vs. pLI by Source (Bonferroni, p <",
format(bonferroni_cutoff, digits = 2), ")"
),
x = "pLI",
y = "-log10(p-value)",
color = "Ratio\n(Case / Control)"
) +
theme_cowplot(12)
# Print the plot
print(reg_variant_plot)
This is added from HOMER results
# HOMER motif-enrichment results (manually transcribed): top transcription
# factors and their enrichment p-values.
motif_data <- data.frame(
  TranscriptionFactor = c(
    "IRF4", "NFY", "NKX2.5", "PRDM1",
    "SMAD2", "ESRRA", "NFKB", "ASCL1"
  ),
  PValue = c(
    1e-23, 1e-19, 1e-18, 1e-17,
    1e-17, 1e-17, 1e-15, 1e-13
  )
)
# Create the plot
# Horizontal bars of -log10(p) per transcription factor (HOMER enrichment)
motif_plot <- ggplot(motif_data, aes(y = TranscriptionFactor, x = -log10(PValue))) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Transcription Factor P-values",
y = "Transcription Factor",
x = "-log10(P-value)") +
theme_cowplot(12)
motif_plot
Combined****************************************************************************************************************** Read the cohort data
# Read the cohort data
total_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/combined_data.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)
# Split into discovery and replication subsets by the Set column
combined_data <- total_data %>% filter(Set =="Discovery")
replication_data <- total_data %>% filter(Set =="Replication")
Categorize the Data based on original categories and arrest status
#Clump by category
categorized_combined_data <- combined_data %>%
mutate(Category = case_when(
arrest_group == "control" ~ "Control",
arrest_group == "case" ~ "zCase"
)) %>%
filter(!is.na(Category))
#Convert Category to a factor
categorized_combined_data$Category <- as.factor(categorized_combined_data$Category)
############################ ############## ############## replication
# Clump replication participants by category. The "z" prefix on zCase is kept
# purely so Case sorts after Control in plots.
categorized_replication_data <- replication_data %>%
  mutate(Category = case_when(
    arrest_group == "control" ~ "Control",
    # Consistency fix: the discovery block uses `==`; `%in% "case"` behaved
    # identically here (NA falls through to NA either way and is filtered),
    # so this change is behavior-preserving.
    arrest_group == "case" ~ "zCase"
  )) %>%
  filter(!is.na(Category))
# Convert Category to a factor
categorized_replication_data$Category <- as.factor(categorized_replication_data$Category)
Collapse the rare vars
# Collapse the per-frequency-bin variant counts into summary burden counts
# (rare = AF<1% bins, ultrarare = AF<0.001% + singletons, null = start-lost +
# stop-gained). One mutate() creates the columns in the same order as before.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
    Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
    Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
    Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
  )
############################ ############## ############## replication
# Same burden-count collapsing as the discovery cohort, applied to replication.
categorized_replication_data <- categorized_replication_data %>%
  mutate(
    Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
    Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
    Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
    Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
  )
Perform multivariate Logistic Regression on everything
# Full multivariate logistic regression: case/control status on all PRS
# covariates plus the rare/ultrarare/null/noncoding burden counts.
model <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the warning indicates quasi-complete separation for some
# predictor combination; coefficients for the affected terms may be unstable.
model
##
## Call: glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval +
## Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
## Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI +
## Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare +
## Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy +
## Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare +
## Cardiomyopathy_null + Epilepsy_null, family = binomial(),
## data = categorized_combined_data)
##
## Coefficients:
## (Intercept) Normalized_Heart_rate
## -1.13276 -0.06633
## Normalized_PR_interval Normalized_QTc
## 0.10545 -0.06282
## Normalized_BP Normalized_QRS
## 0.29888 0.41302
## Normalized_LVEF Normalized_LVESV
## -0.17686 0.16828
## Normalized_SVI Normalized_Afib
## 0.05697 -0.07965
## Normalized_LVEDVI Normalized_SV
## -0.09490 -0.09174
## Epilepsy_rare Epilepsy_ultrarare
## -0.06083 -0.01102
## Cardiomyopathy_rare Cardiomyopathy_ultrarare
## -0.03503 -0.01155
## PLP_Epilepsy PLP_Cardiomyopathy
## -0.19700 1.18960
## Cardiomyopathy_noncoding_rare Epilepsy_noncoding_rare
## -0.01056 0.03214
## Cardiomyopathy_null Epilepsy_null
## 0.96647 0.44278
##
## Degrees of Freedom: 992 Total (i.e. Null); 971 Residual
## Null Deviance: 1370
## Residual Deviance: 1132 AIC: 1176
Compute the scores
# tell it how to make the scores
# type = "link" returns the linear predictor (log-odds scale), not the
# probability; probabilities are derived later via plogis().
scores <- predict(model, type = "link")
# Out-of-sample scores for the replication cohort using the discovery model
scores_replication <- predict(model, newdata = categorized_replication_data, type = "link")
# Add scores to the dataframes
categorized_combined_data$scores <- scores
categorized_replication_data$scores_replication <- scores_replication
Perform multivariate logistic regressions based only on the input parameters and subsets
# Nested sub-models, each using a subset of the full model's predictors:
# cardiomyopathy coding burden only
model_cardiomyopathy_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null, data = categorized_combined_data, family = binomial())
# epilepsy coding burden only
model_epilepsy_rare <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null, data = categorized_combined_data, family = binomial())
# noncoding burden only
model_noncoding_rare <- glm(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare, data = categorized_combined_data, family = binomial())
# common-variant (PRS) terms only
model_common <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())
# all coding (cardiomyopathy + epilepsy) burden terms
model_coding_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null, data = categorized_combined_data, family = binomial())
# epilepsy burden + PRS terms
model_epilepsy_and_common <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())
# cardiomyopathy burden + PRS terms
model_cardiac_and_common <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())
# full model minus the noncoding terms
model_common_and_coding <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, family = binomial())
Compute the rest of the scores and determine rank-based quintiles for all scores
# In-sample linear-predictor (log-odds) scores from each sub-model
categorized_combined_data$cardiac_variant_score <- predict(model_cardiomyopathy_rare, type = "link")
categorized_combined_data$epilepsy_variant_score <- predict(model_epilepsy_rare, type = "link")
categorized_combined_data$noncoding_variant_score <- predict(model_noncoding_rare, type = "link")
categorized_combined_data$common_variant_score <- predict(model_common, type = "link")
categorized_combined_data$common_and_coding_score <- predict(model_common_and_coding, type = "link")
categorized_combined_data$coding_rare_score <- predict(model_coding_rare, type = "link")
categorized_combined_data$epilepsy_and_common_score <- predict(model_epilepsy_and_common, type = "link")
categorized_combined_data$cardiac_and_common_score <- predict(model_cardiac_and_common, type = "link")
# Rank-based quintiles for every score: rank / (n/5), rounded up, gives the
# quintile index 1..5 with ties broken by first occurrence.
# NOTE(review): dplyr::ntile(score, 5) would compute the same assignment
# directly; kept as-is here to avoid touching downstream column names.
categorized_combined_data <- categorized_combined_data %>%
mutate(
rank = rank(scores, ties.method = "first"),
score_quintiles = ceiling(rank / (n() / 5)),
rank_cardiac = rank(cardiac_variant_score, ties.method = "first"),
cardiac_score_quintiles = ceiling(rank_cardiac / (n() / 5)),
rank_epilepsy = rank(epilepsy_variant_score, ties.method = "first"),
epilepsy_score_quintiles = ceiling(rank_epilepsy / (n() / 5)),
rank_noncoding = rank(noncoding_variant_score, ties.method = "first"),
noncoding_score_quintiles = ceiling(rank_noncoding / (n() / 5)),
rank_common = rank(common_variant_score, ties.method = "first"),
common_score_quintiles = ceiling(rank_common / (n() / 5)),
rank_common_and_coding = rank(common_and_coding_score, ties.method = "first"),
common_and_coding_score_quintiles = ceiling(rank_common_and_coding / (n() / 5)),
rank_model_coding_rare = rank(coding_rare_score, ties.method = "first"),
coding_rare_score_quintiles = ceiling(rank_model_coding_rare / (n() / 5)),
rank_epilepsy_and_common = rank(epilepsy_and_common_score, ties.method = "first"),
epilepsy_and_common_score_quintiles = ceiling(rank_epilepsy_and_common / (n() / 5)),
rank_cardiac_and_common = rank(cardiac_and_common_score, ties.method = "first"),
cardiac_and_common_score_quintiles = ceiling(rank_cardiac_and_common / (n() / 5))
)
# Map each linear-predictor score onto the probability scale with the
# inverse-logit transform (plogis).
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    probability = plogis(scores),
    cardiac_probability = plogis(cardiac_variant_score),
    epilepsy_probability = plogis(epilepsy_variant_score),
    noncoding_probability = plogis(noncoding_variant_score),
    common_probability = plogis(common_variant_score),
    common_and_coding_probability = plogis(common_and_coding_score),
    coding_rare_probability = plogis(coding_rare_score),
    epilepsy_and_common_probability = plogis(epilepsy_and_common_score),
    cardiac_and_common_probability = plogis(cardiac_and_common_score)
  )
# Per-quintile odds ratios (relative to the lowest quintile) with Wald 95% CIs.
#
# Args:
#   data: data frame containing the category and quintile columns.
#   category_column: unquoted column with "Control" / "zCase" labels.
#   quintile_column: unquoted column with the quintile assignment (1..5).
#
# Returns one row per quintile: control/case counts, odds, odds ratio vs.
# quintile 1, standard error of the log OR, and 95% CI bounds.
calculate_odds_ratios <- function(data, category_column, quintile_column) {
  # Count controls and cases within each quintile (group_by sorts by quintile,
  # so row 1 is the lowest quintile)
  odds_data <- data %>%
    dplyr::group_by({{ quintile_column }}) %>%
    dplyr::summarise(
      count_category_1 = sum({{ category_column }} == "Control", na.rm = TRUE),
      count_category_case = sum({{ category_column }} == "zCase", na.rm = TRUE),
      .groups = 'drop'
    ) %>%
    dplyr::mutate(
      odds = count_category_case / count_category_1
    )
  # Lowest quintile is the reference group
  first_quintile_odds <- odds_data$odds[1]
  ref_controls <- odds_data$count_category_1[1]
  ref_cases <- odds_data$count_category_case[1]
  # FIX: the Wald SE of a log odds ratio between two groups must include the
  # reference group's counts: sqrt(1/a + 1/b + 1/c + 1/d). The previous
  # version used only the quintile's own counts, understating the CI width.
  odds_data <- odds_data %>%
    dplyr::mutate(
      odds_ratio = odds / first_quintile_odds,
      se_log_odds_ratio = sqrt(
        1 / count_category_1 + 1 / count_category_case +
          1 / ref_controls + 1 / ref_cases
      ),
      lower_ci = exp(log(odds_ratio) - 1.96 * se_log_odds_ratio),
      upper_ci = exp(log(odds_ratio) + 1.96 * se_log_odds_ratio)
    )
  return(odds_data)
}
# Per-quintile odds-ratio summaries for every score (discovery cohort).
# Uses `<-` for assignment per R convention.
combined_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
cardiac_odds_summary <- calculate_odds_ratios(categorized_combined_data, Category, cardiac_score_quintiles)
epilepsy_summary <- calculate_odds_ratios(categorized_combined_data, Category, epilepsy_score_quintiles)
common_summary <- calculate_odds_ratios(categorized_combined_data, Category, common_score_quintiles)
noncoding_summary <- calculate_odds_ratios(categorized_combined_data, Category, noncoding_score_quintiles)
common_and_coding_summary <- calculate_odds_ratios(categorized_combined_data, Category, common_and_coding_score_quintiles)
coding_rare_summary <- calculate_odds_ratios(categorized_combined_data, Category, coding_rare_score_quintiles)
# NOTE: the "commmon" typo is preserved so downstream references still resolve
commmon_and_epilepsy_summary <- calculate_odds_ratios(categorized_combined_data, Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary <- calculate_odds_ratios(categorized_combined_data, Category, cardiac_and_common_score_quintiles)
################################################################################################################ replication
# Out-of-sample scores: apply each discovery-trained sub-model to the
# replication cohort (log-odds scale).
categorized_replication_data$cardiac_variant_score_replication <- predict(model_cardiomyopathy_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$epilepsy_variant_score_replication <- predict(model_epilepsy_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$noncoding_variant_score_replication <- predict(model_noncoding_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$common_variant_score_replication <- predict(model_common, newdata = categorized_replication_data, type = "link")
categorized_replication_data$common_and_coding_score_replication <- predict(model_common_and_coding, newdata = categorized_replication_data, type = "link")
categorized_replication_data$coding_rare_score_replication <- predict(model_coding_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$epilepsy_and_common_score_replication <- predict(model_epilepsy_and_common, newdata = categorized_replication_data, type = "link")
categorized_replication_data$cardiac_and_common_score_replication <- predict(model_cardiac_and_common, newdata = categorized_replication_data, type = "link")
# Rank-based quintiles for the replication scores (same ceiling(rank/(n/5))
# scheme as the discovery cohort; quintile boundaries are defined within the
# replication sample itself).
categorized_replication_data <- categorized_replication_data %>%
mutate(
rank = rank(scores_replication, ties.method = "first"),
score_quintiles = ceiling(rank / (n() / 5)),
rank_cardiac = rank(cardiac_variant_score_replication, ties.method = "first"),
cardiac_score_quintiles = ceiling(rank_cardiac / (n() / 5)),
rank_epilepsy = rank(epilepsy_variant_score_replication, ties.method = "first"),
epilepsy_score_quintiles = ceiling(rank_epilepsy / (n() / 5)),
rank_noncoding = rank(noncoding_variant_score_replication, ties.method = "first"),
noncoding_score_quintiles = ceiling(rank_noncoding / (n() / 5)),
rank_common = rank(common_variant_score_replication, ties.method = "first"),
common_score_quintiles = ceiling(rank_common / (n() / 5)),
rank_common_and_coding = rank(common_and_coding_score_replication, ties.method = "first"),
common_and_coding_score_quintiles = ceiling(rank_common_and_coding / (n() / 5)),
rank_model_coding_rare = rank(coding_rare_score_replication, ties.method = "first"),
coding_rare_score_quintiles = ceiling(rank_model_coding_rare / (n() / 5)),
rank_epilepsy_and_common = rank(epilepsy_and_common_score_replication, ties.method = "first"),
epilepsy_and_common_score_quintiles = ceiling(rank_epilepsy_and_common / (n() / 5)),
rank_cardiac_and_common = rank(cardiac_and_common_score_replication, ties.method = "first"),
cardiac_and_common_score_quintiles = ceiling(rank_cardiac_and_common / (n() / 5))
)
# Inverse-logit transform of every replication score onto the probability scale.
categorized_replication_data <- categorized_replication_data %>%
  mutate(
    probability = plogis(scores_replication),
    cardiac_probability = plogis(cardiac_variant_score_replication),
    epilepsy_probability = plogis(epilepsy_variant_score_replication),
    noncoding_probability = plogis(noncoding_variant_score_replication),
    common_probability = plogis(common_variant_score_replication),
    common_and_coding_probability = plogis(common_and_coding_score_replication),
    coding_rare_probability = plogis(coding_rare_score_replication),
    epilepsy_and_common_probability = plogis(epilepsy_and_common_score_replication),
    cardiac_and_common_probability = plogis(cardiac_and_common_score_replication)
  )
# Per-quintile odds-ratio summaries for every score (replication cohort).
combined_odds_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, score_quintiles)
cardiac_odds_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, cardiac_score_quintiles)
epilepsy_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, epilepsy_score_quintiles)
common_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, common_score_quintiles)
noncoding_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, noncoding_score_quintiles)
common_and_coding_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, common_and_coding_score_quintiles)
coding_rare_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, coding_rare_score_quintiles)
# NOTE: the "commmon" typo is preserved so downstream references still resolve
commmon_and_epilepsy_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, cardiac_and_common_score_quintiles)
plot
# Discovery cohort: log2(odds ratio) by score quintile with 95% CIs for the
# combined model plus each component model. The six per-model overlay blocks
# were copy-pasted in the original; they are deduplicated into a loop here.
# Draw order, colors, and symbols are unchanged.
overlay_specs <- list(
  list(data = cardiac_odds_summary, col = "#2E86C1"),
  list(data = epilepsy_summary, col = "#A93226"),
  list(data = common_summary, col = "#229954"),
  list(data = noncoding_summary, col = "#D68910"),
  list(data = common_and_coding_summary, col = "Black"),
  list(data = coding_rare_summary, col = "Pink")
)
# y-limits span every CI drawn on the plot (same seven summaries as before)
lower_vals <- c(combined_odds_summary$lower_ci,
                unlist(lapply(overlay_specs, function(s) s$data$lower_ci)))
upper_vals <- c(combined_odds_summary$upper_ci,
                unlist(lapply(overlay_specs, function(s) s$data$upper_ci)))
plot(log2(combined_odds_summary$odds_ratio),
     ylim = c(log2(min(lower_vals)), log2(max(upper_vals))),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")
# Combined model: connecting line and 95% CI whiskers
lines(seq_along(combined_odds_summary$odds_ratio), log2(combined_odds_summary$odds_ratio), col = "#3C8C78")
arrows(x0 = seq_along(combined_odds_summary$odds_ratio),
       y0 = log2(combined_odds_summary$lower_ci),
       y1 = log2(combined_odds_summary$upper_ci),
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")
# Overlay each component model in the original order
for (spec in overlay_specs) {
  idx <- seq_along(spec$data$odds_ratio)
  lines(idx, log2(spec$data$odds_ratio), col = spec$col)
  points(log2(spec$data$odds_ratio), pch = 19, col = spec$col)
  arrows(x0 = idx,
         y0 = log2(spec$data$lower_ci),
         y1 = log2(spec$data$upper_ci),
         code = 3, angle = 90, length = 0.05, col = spec$col)
}
# Add x-axis labels for quintiles
axis(1, at = seq_along(combined_odds_summary$odds_ratio), labels = TRUE)
# legend
legend("topleft",
       legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding",
                  "common_and_coding", "model_coding_rare"),
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"),
       pch = 19, cex = 0.8)
plot replication
# Replication cohort: same log2(OR)-by-quintile plot, deduplicated into a loop.
# y-limits are fixed (as in the original) rather than data-derived.
overlay_specs_replication <- list(
  list(data = cardiac_odds_summary_replication, col = "#2E86C1"),
  list(data = epilepsy_summary_replication, col = "#A93226"),
  list(data = common_summary_replication, col = "#229954"),
  list(data = noncoding_summary_replication, col = "#D68910"),
  list(data = common_and_coding_summary_replication, col = "Black"),
  list(data = coding_rare_summary_replication, col = "Pink")
)
plot(log2(combined_odds_summary_replication$odds_ratio),
     ylim = c(-2, 10),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")
# Combined model: connecting line and 95% CI whiskers
lines(seq_along(combined_odds_summary_replication$odds_ratio), log2(combined_odds_summary_replication$odds_ratio), col = "#3C8C78")
arrows(x0 = seq_along(combined_odds_summary_replication$odds_ratio),
       y0 = log2(combined_odds_summary_replication$lower_ci),
       y1 = log2(combined_odds_summary_replication$upper_ci),
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")
# Overlay each component model in the original order
for (spec in overlay_specs_replication) {
  idx <- seq_along(spec$data$odds_ratio)
  lines(idx, log2(spec$data$odds_ratio), col = spec$col)
  points(log2(spec$data$odds_ratio), pch = 19, col = spec$col)
  arrows(x0 = idx,
         y0 = log2(spec$data$lower_ci),
         y1 = log2(spec$data$upper_ci),
         code = 3, angle = 90, length = 0.05, col = spec$col)
}
# Add x-axis labels for quintiles
axis(1, at = seq_along(combined_odds_summary_replication$odds_ratio), labels = TRUE)
# legend
legend("topleft",
       legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding",
                  "common_and_coding", "model_coding_rare"),
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"),
       pch = 19, cex = 0.8)
Z-test for the significance of differences between per-quintile odds ratios
# Two-sided Z-test comparing log odds ratios between two quintile summaries.
#
# Args:
#   odds1, odds2: data frames with columns `odds_ratio` and
#     `se_log_odds_ratio`, one row per quintile (as produced by
#     calculate_odds_ratios); rows are assumed to align by quintile.
#
# Returns a data frame with one two-sided p-value per quintile.
compare_odds <- function(odds1, odds2) {
  # Difference in log odds ratios per quintile
  delta_log_odds <- log(odds1$odds_ratio) - log(odds2$odds_ratio)
  # Standard error of the difference of two (assumed independent) estimates
  se_delta_log_odds <- sqrt(odds1$se_log_odds_ratio^2 + odds2$se_log_odds_ratio^2)
  # Normal-approximation two-sided p-value for each quintile
  z_values <- delta_log_odds / se_delta_log_odds
  p_values <- 2 * pnorm(abs(z_values), lower.tail = FALSE)
  # FIX: seq_along() replaces 1:length(p_values), which would yield c(1, 0)
  # for zero-row input
  data.frame(quintile = seq_along(p_values), p_values = p_values)
}
# Apply the function to compare 'coding_rare_summary', 'common_and_coding_summary', and 'combined_odds_summary'
# Each comparison asks whether adding a predictor class changes the
# per-quintile odds ratios.
p_values_coding_vs_common_coding <- compare_odds(coding_rare_summary, common_and_coding_summary)
p_values_coding_vs_model <- compare_odds(coding_rare_summary, combined_odds_summary)
p_values_common_coding_vs_model <- compare_odds(common_and_coding_summary, combined_odds_summary)
# Combine the data frames with an identifier for each comparison
p_values_coding_vs_common_coding$comparison <- "Coding vs Common and Coding"
p_values_coding_vs_model$comparison <- "Coding vs Model"
p_values_common_coding_vs_model$comparison <- "Common and Coding vs Model"
# Bind the rows together
combined_p_values <- bind_rows(p_values_coding_vs_common_coding,
p_values_coding_vs_model,
p_values_common_coding_vs_model)
# Convert quintile to a factor so plots/tables order quintiles 1..5
combined_p_values$quintile <- factor(combined_p_values$quintile, levels = 1:5)
combined_p_values
combined_p_values
## quintile p_values comparison
## 1 1 1.0000000000 Coding vs Common and Coding
## 2 2 0.0302326527 Coding vs Common and Coding
## 3 3 0.1655321102 Coding vs Common and Coding
## 4 4 0.0016031606 Coding vs Common and Coding
## 5 5 0.0376069426 Coding vs Common and Coding
## 6 1 1.0000000000 Coding vs Model
## 7 2 0.0459511700 Coding vs Model
## 8 3 0.0041855535 Coding vs Model
## 9 4 0.0006605028 Coding vs Model
## 10 5 0.0006065780 Coding vs Model
## 11 1 1.0000000000 Common and Coding vs Model
## 12 2 0.8875234087 Common and Coding vs Model
## 13 3 0.1454995597 Common and Coding vs Model
## 14 4 0.8097807932 Common and Coding vs Model
## 15 5 0.1582202334 Common and Coding vs Model
Calculate the scores per category and plot them
# Notched boxplots of each discovery score by Category. The five duplicated
# ggplot blocks are deduplicated into one helper; output is unchanged.
#
# Helper: boxplot of one score column (unquoted, tidy-eval embraced) with a
# clipped y-axis. Outliers are hidden; notches approximate median CIs.
plot_score_boxplot <- function(data, score_col, ylims) {
  ggplot(data, aes(x = Category, y = {{ score_col }}, fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    scale_fill_manual(values = group_colors) +
    ylim(ylims[1], ylims[2]) +
    theme_cowplot(12)
}
common_variant_score_plot <- plot_score_boxplot(categorized_combined_data, common_variant_score, c(-2, 2))
cardiac_variant_score_plot <- plot_score_boxplot(categorized_combined_data, cardiac_variant_score, c(-1, 1))
epilepsy_variant_score_plot <- plot_score_boxplot(categorized_combined_data, epilepsy_variant_score, c(-1, 1))
noncoding_variant_score_plot <- plot_score_boxplot(categorized_combined_data, noncoding_variant_score, c(-1, 0.75))
scores_plot <- plot_score_boxplot(categorized_combined_data, scores, c(-3, 3))
# suppressWarnings hides the "rows removed" warnings caused by ylim clipping
suppressWarnings(print(common_variant_score_plot))
suppressWarnings(print(cardiac_variant_score_plot))
suppressWarnings(print(epilepsy_variant_score_plot))
suppressWarnings(print(noncoding_variant_score_plot))
suppressWarnings(print(scores_plot))
replication
Now plot the Z normalized replication data
# Z-normalize each replication score and draw Category boxplots.
# Means/SDs are kept as standalone variables in case later chunks use them.
mean_common_replication <- mean(categorized_replication_data$common_variant_score_replication)
sd_common_replication <- sd(categorized_replication_data$common_variant_score_replication)
mean_cardiac_replication <- mean(categorized_replication_data$cardiac_variant_score_replication)
sd_cardiac_replication <- sd(categorized_replication_data$cardiac_variant_score_replication)
mean_epilepsy_replication <- mean(categorized_replication_data$epilepsy_variant_score_replication)
sd_epilepsy_replication <- sd(categorized_replication_data$epilepsy_variant_score_replication)
mean_noncoding_replication <- mean(categorized_replication_data$noncoding_variant_score_replication)
sd_noncoding_replication <- sd(categorized_replication_data$noncoding_variant_score_replication)
mean_scores_replication <- mean(categorized_replication_data$scores_replication)
sd_scores_replication <- sd(categorized_replication_data$scores_replication)
# Helper: boxplot of a z-normalized score ((x - center) / spread) on a fixed
# [-2.5, 2.5] axis; deduplicates five identical ggplot blocks.
plot_z_boxplot <- function(data, score_col, center, spread) {
  ggplot(data, aes(x = Category, y = ({{ score_col }} - center) / spread, fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    scale_fill_manual(values = group_colors) +
    ylim(-2.5, 2.5) +
    theme_cowplot(12)
}
common_variant_score_replication_plot <- plot_z_boxplot(categorized_replication_data, common_variant_score_replication, mean_common_replication, sd_common_replication)
cardiac_variant_score_replication_plot <- plot_z_boxplot(categorized_replication_data, cardiac_variant_score_replication, mean_cardiac_replication, sd_cardiac_replication)
epilepsy_variant_score_replication_plot <- plot_z_boxplot(categorized_replication_data, epilepsy_variant_score_replication, mean_epilepsy_replication, sd_epilepsy_replication)
noncoding_variant_score_replication_plot <- plot_z_boxplot(categorized_replication_data, noncoding_variant_score_replication, mean_noncoding_replication, sd_noncoding_replication)
scores_replication_plot <- plot_z_boxplot(categorized_replication_data, scores_replication, mean_scores_replication, sd_scores_replication)
# suppressWarnings hides the "rows removed" warnings caused by ylim clipping
suppressWarnings(print(common_variant_score_replication_plot))
suppressWarnings(print(cardiac_variant_score_replication_plot))
suppressWarnings(print(epilepsy_variant_score_replication_plot))
suppressWarnings(print(noncoding_variant_score_replication_plot))
suppressWarnings(print(scores_replication_plot))
Compute the wilcox.tests
# Wilcoxon rank-sum (Mann-Whitney) tests: does each discovery score differ in
# distribution between Control and zCase? (Two-sided; output echoed below.)
wilcox.test_cardiomyopathy <- wilcox.test(cardiac_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_cardiomyopathy
##
## Wilcoxon rank sum test with continuity correction
##
## data: cardiac_variant_score by Category
## W = 76739, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_epilepsy <- wilcox.test(epilepsy_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_epilepsy
##
## Wilcoxon rank sum test with continuity correction
##
## data: epilepsy_variant_score by Category
## W = 86944, p-value = 3.215e-15
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_common <- wilcox.test(common_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_common
##
## Wilcoxon rank sum test with continuity correction
##
## data: common_variant_score by Category
## W = 77863, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_noncoding <- wilcox.test(noncoding_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_noncoding
##
## Wilcoxon rank sum test with continuity correction
##
## data: noncoding_variant_score by Category
## W = 101916, p-value = 5.208e-06
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_combined <- wilcox.test(scores ~ Category, data = categorized_combined_data)
wilcox.test_combined
##
## Wilcoxon rank sum test with continuity correction
##
## data: scores by Category
## W = 57835, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
replication
# Same Wilcoxon rank-sum tests on the out-of-sample replication scores; these
# are the replication evidence for the discovery-cohort differences above.
wilcox.test_cardiomyopathy_replication <- wilcox.test(cardiac_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_cardiomyopathy_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: cardiac_variant_score_replication by Category
## W = 850, p-value = 3.419e-08
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_epilepsy_replication <- wilcox.test(epilepsy_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_epilepsy_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: epilepsy_variant_score_replication by Category
## W = 1286, p-value = 0.0006998
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_common_replication <- wilcox.test(common_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_common_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: common_variant_score_replication by Category
## W = 1667, p-value = 0.1309
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_noncoding_replication <- wilcox.test(noncoding_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_noncoding_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: noncoding_variant_score_replication by Category
## W = 523, p-value = 1.213e-12
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_combined_replication <- wilcox.test(scores_replication ~ Category, data = categorized_replication_data)
wilcox.test_combined_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: scores_replication by Category
## W = 498, p-value = 4.973e-13
## alternative hypothesis: true location shift is not equal to 0
Plot the distribution of the categories by score
# Overlaid density curves of the combined score for controls vs. cases.
# FIX: dropped the deprecated `..density..` dot-dot notation — geom_density()
# maps y to after_stat(density) by default, so the explicit aes was redundant
# and triggers a deprecation warning in current ggplot2.
distribution_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) +
  geom_density(alpha = 0.5, adjust = 1) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  labs(title = "Normalized Density Plots of Scores by Category",
       x = "Score",
       y = "Density") +
  theme_cowplot(12)
# suppressWarnings hides the "rows removed" warnings from the x-limit clipping
suppressWarnings(print(distribution_score_plot))
Plot the filled density of the categories by score
# Stacked (conditional) density: at each score, the fraction of the local
# density contributed by controls vs. cases.
density_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) +
  geom_density(position = "fill", alpha = 0.5) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Stacked Density of Scores by Category",
    x = "Score",
    y = "Fraction"
  ) +
  theme_cowplot()
# Print the plot
suppressWarnings(print(density_score_plot))
Plot the ROCs
# Convert 'Category' into a binary format: 0 for control, 1 for case
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
# ROC curves / AUCs for each model's in-sample predicted probabilities.
# Full model
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("Full model AUC:")
## [1] "Full model AUC:"
auc(roc_result_full)
## Area under the curve: 0.7638
# Common variants
roc_result_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("commonvariant model AUC:")
## [1] "commonvariant model AUC:"
auc(roc_result_common)
## Area under the curve: 0.682
# Coding variants
roc_result_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$coding_rare_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("coding variant model AUC:")
## [1] "coding variant model AUC:"
auc(roc_result_coding)
## Area under the curve: 0.7008
# Cardiac and common variants
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# FIX: label previously read "coding variant model AUC:" (copy-paste error)
paste("cardiac and common variant model AUC:")
## [1] "cardiac and common variant model AUC:"
auc(roc_result_cardiac_and_common)
## Area under the curve: 0.7364
# Common and coding variants
roc_result_common_and_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_and_coding_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("common and coding variant model AUC:")
## [1] "common and coding variant model AUC:"
auc(roc_result_common_and_coding)
## Area under the curve: 0.7449
Plot the replication ROC and Precision-recall
# Convert 'Category' into a binary format: 0 for control, 1 for case
categorized_replication_data$binary_outcome <- ifelse(categorized_replication_data$Category == "Control", 0, 1)
# ROC/AUC for the full model in the replication cohort
roc_result_full_replication <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full_replication)
## Area under the curve: 0.874
Figure out the ROC improvement with each variant class and plot it
#compute the ones not plotted
# ROC curves for the remaining single-class and two-class models (no plots).
# Note: roc_result_cardiac_and_common was already computed above; this
# recomputation is redundant but harmless.
roc_result_cardiac <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC for each ROC object
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)
# Base AUC for random chance (a classifier with no discrimination).
auc_random_chance <- 0.5
# Percent improvement of each model's AUC over random chance.
percentage_changes_df <- data.frame(
  Model = c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common",
            "Cardiac and Common", "Common and Coding", "Coding", "Full"),
  PercentageChange = (c(auc_epilepsy, auc_cardiac, auc_common,
                        auc_epilepsy_and_common, auc_cardiac_and_common,
                        auc_common_and_coding, auc_coding, auc_full) -
                        auc_random_chance) / auc_random_chance * 100
)
# Raw AUC values per model, kept alongside the percent changes.
auc_df <- data.frame(
  Model = c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common",
            "Cardiac and Common", "Common and Coding", "Coding", "Full"),
  auc = c(auc_epilepsy, auc_cardiac, auc_common, auc_epilepsy_and_common,
          auc_cardiac_and_common, auc_common_and_coding, auc_coding, auc_full)
)
# Set factor levels so bars are ordered single-class -> combined models.
percentage_changes_df$Model <- factor(percentage_changes_df$Model, levels = c(
  "Epilepsy", "Cardiac", "Coding", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Full"
))
# Bar chart of percent AUC improvement over random chance.
# FIX: removed the redundant theme_minimal() — theme_cowplot(12) is a complete
# theme and fully replaced it, so it had no effect on the rendered plot.
auc_performance_plot <- ggplot(percentage_changes_df, aes(x = Model, y = PercentageChange, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Percentage Change Over Random Chance AUC",
       y = "Percentage Change (%)",
       x = "") +
  theme_cowplot(12)
print(auc_performance_plot)
# Calculate p-values using DeLong's test
# Pairwise comparisons of correlated ROC curves (same subjects under both
# models). NOTE(review): p-values are not adjusted for multiple comparisons.
p_values <- list(
"Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
"CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
"Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
"Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
"Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
"CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
"Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
"CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
"Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
"Full vs Common and Coding" = roc.test(roc_result_common_and_coding, roc_result_full)$p.value
)
# Print p-values
print(p_values)
## $`Epilepsy vs CMAR`
## [1] 0.03714183
##
## $`CMAR vs Common`
## [1] 0.8236934
##
## $`Epilepsy vs Common`
## [1] 0.05250968
##
## $`Coding vs CMAR`
## [1] 0.1295995
##
## $`Coding vs Epilepsy`
## [1] 4.40734e-05
##
## $`CMAR and Common vs Common`
## [1] 3.718671e-06
##
## $`Common vs Coding`
## [1] 0.336048
##
## $`CMAR and common vs Common and Coding`
## [1] 0.06421658
##
## $`Common vs Common and Coding`
## [1] 2.86077e-07
##
## $`Full vs Common and Coding`
## [1] 0.004615593
Compute the deviance attributable to each class
# Perform ANOVA on the model
# Intercept-only (null) logistic model, used as the deviance baseline.
null_model <- glm(Category ~ 1, data = categorized_combined_data, family = binomial())
# Perform ANOVA on the full model
# `model` is the full GAPS glm fitted earlier in the file (outside this chunk).
# NOTE: anova() adds terms sequentially (Type I), so each term's deviance
# depends on the order of predictors in the model formula.
anova_results <- anova(model, test = "Chisq")
# Calculate the null deviance and the full model deviance
null_deviance <- deviance(null_model)
full_deviance <- deviance(model)
# Calculate the total sum of squares (using total deviance)
# i.e. the total deviance explained by the full model over the null model.
tss <- null_deviance - full_deviance
# Calculate the proportion of deviance explained by each predictor
# (the NULL row has no Deviance entry, so its proportion is NA).
anova_results$Proportion_Variance_Explained <- anova_results$`Deviance` / tss
# Display the results
anova_results
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Category
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 992 1370.0
## Normalized_Heart_rate 1 8.094 991 1361.9 0.00444
## Normalized_PR_interval 1 6.193 990 1355.7 0.01282
## Normalized_QTc 1 5.429 989 1350.3 0.01980
## Normalized_BP 1 21.492 988 1328.8 0.00000
## Normalized_QRS 1 32.145 987 1296.6 0.00000
## Normalized_LVEF 1 11.611 986 1285.0 0.00066
## Normalized_LVESV 1 2.819 985 1282.2 0.09316
## Normalized_SVI 1 4.344 984 1277.8 0.03715
## Normalized_Afib 1 0.736 983 1277.1 0.39103
## Normalized_LVEDVI 1 6.693 982 1270.4 0.00968
## Normalized_SV 1 1.424 981 1269.0 0.23282
## Epilepsy_rare 1 7.220 980 1261.8 0.00721
## Epilepsy_ultrarare 1 11.748 979 1250.0 0.00061
## Cardiomyopathy_rare 1 6.754 978 1243.3 0.00935
## Cardiomyopathy_ultrarare 1 3.595 977 1239.7 0.05795
## PLP_Epilepsy 1 0.009 976 1239.7 0.92564
## PLP_Cardiomyopathy 1 46.892 975 1192.8 0.00000
## Cardiomyopathy_noncoding_rare 1 3.390 974 1189.4 0.06561
## Epilepsy_noncoding_rare 1 35.430 973 1154.0 0.00000
## Cardiomyopathy_null 1 16.730 972 1137.2 0.00004
## Epilepsy_null 1 4.795 971 1132.4 0.02855
## Proportion_Variance_Explained
## NULL
## Normalized_Heart_rate 0.034075
## Normalized_PR_interval 0.026072
## Normalized_QTc 0.022857
## Normalized_BP 0.090478
## Normalized_QRS 0.135322
## Normalized_LVEF 0.048878
## Normalized_LVESV 0.011867
## Normalized_SVI 0.018286
## Normalized_Afib 0.003097
## Normalized_LVEDVI 0.028176
## Normalized_SV 0.005993
## Epilepsy_rare 0.030394
## Epilepsy_ultrarare 0.049455
## Cardiomyopathy_rare 0.028434
## Cardiomyopathy_ultrarare 0.015135
## PLP_Epilepsy 0.000037
## PLP_Cardiomyopathy 0.197408
## Cardiomyopathy_noncoding_rare 0.014270
## Epilepsy_noncoding_rare 0.149152
## Cardiomyopathy_null 0.070428
## Epilepsy_null 0.020185
REDO the “improvement” testing in the replication cohort
# Compute ROC curves for all models
# Replication-cohort ROC curves; these overwrite the discovery-cohort ROC
# objects of the same names computed above.
roc_result_cardiac <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$cardiac_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$epilepsy_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$epilepsy_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$cardiac_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_common_and_coding <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$common_and_coding_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_coding <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$coding_rare_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_full <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC for each ROC object
# (replication-cohort AUCs; these overwrite the discovery-cohort values above)
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)
# Base AUC for random chance (no-discrimination baseline).
auc_random_chance <- 0.5
# Percent improvement of each replication-cohort model's AUC over chance.
percentage_changes_df <- data.frame(
  Model = c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common",
            "Cardiac and Common", "Common and Coding", "Coding", "Full"),
  PercentageChange = (c(auc_epilepsy, auc_cardiac, auc_common,
                        auc_epilepsy_and_common, auc_cardiac_and_common,
                        auc_common_and_coding, auc_coding, auc_full) -
                        auc_random_chance) / auc_random_chance * 100
)
# Raw replication-cohort AUC values per model.
auc_df <- data.frame(
  Model = c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common",
            "Cardiac and Common", "Common and Coding", "Coding", "Full"),
  auc = c(auc_epilepsy, auc_cardiac, auc_common, auc_epilepsy_and_common,
          auc_cardiac_and_common, auc_common_and_coding, auc_coding, auc_full)
)
# Set factor levels so bars are ordered single-class -> combined models.
percentage_changes_df$Model <- factor(percentage_changes_df$Model, levels = c(
  "Epilepsy", "Cardiac", "Coding", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Full"
))
# Bar chart of percent AUC improvement over chance (replication cohort).
# FIX: removed the redundant theme_minimal() — theme_cowplot(12) is a complete
# theme and fully replaced it, so it had no effect on the rendered plot.
auc_performance_plot <- ggplot(percentage_changes_df, aes(x = Model, y = PercentageChange, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Percentage Change Over Random Chance AUC",
       y = "Percentage Change (%)",
       x = "") +
  theme_cowplot(12)
print(auc_performance_plot)
# Calculate p-values using DeLong's test
# Pairwise DeLong comparisons in the replication cohort.
# NOTE(review): the final comparison here is "Full vs CMAR", whereas the
# discovery section tested "Full vs Common and Coding" — confirm this
# asymmetry is intentional. P-values are not multiplicity-adjusted.
p_values <- list(
"Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
"CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
"Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
"Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
"Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
"CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
"Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
"CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
"Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
"Full vs CMAR" = roc.test(roc_result_cardiac, roc_result_full)$p.value
)
# Print p-values
print(p_values)
## $`Epilepsy vs CMAR`
## [1] 0.05440609
##
## $`CMAR vs Common`
## [1] 0.0005775265
##
## $`Epilepsy vs Common`
## [1] 0.1143759
##
## $`Coding vs CMAR`
## [1] 0.7027383
##
## $`Coding vs Epilepsy`
## [1] 0.003616045
##
## $`CMAR and Common vs Common`
## [1] 0.004465682
##
## $`Common vs Coding`
## [1] 0.001003417
##
## $`CMAR and common vs Common and Coding`
## [1] 0.1669517
##
## $`Common vs Common and Coding`
## [1] 0.0009395074
##
## $`Full vs CMAR`
## [1] 0.04508866
Let's see how common and rare cardiac variants interact
# Scatter of per-individual cardiac vs common variant-score ranks,
# colored by arrest status.
dot_plot <- ggplot(categorized_combined_data, aes(x = rank_cardiac, y = rank_common, color = Category)) +
  geom_point(alpha = 1) +
  scale_color_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(strip.text.x = element_text(size = 10)) +
  labs(x = "Cardiac Variant Score Rank", y = "Common Variant Score Rank", color = "Arrest Status")
dot_plot
# Median ranks define the quadrant boundaries.
median_cardiac <- median(categorized_combined_data$rank_cardiac, na.rm = TRUE)
median_common <- median(categorized_combined_data$rank_common, na.rm = TRUE)
# Filter for the top half
# Q1 = low/low, Q2 = low cardiac/high common, Q3 = high cardiac/low common,
# Q4 = high/high.
# NOTE(review): Q1 uses <= on both axes while Q4 uses >=, so an individual
# whose ranks equal BOTH medians would be counted in Q1 and Q4 — the group
# totals below (537 + 456 = 993) suggest no such tie occurred, but confirm.
quadrant1 <- categorized_combined_data %>%
filter(rank_cardiac <= median_cardiac & rank_common <= median_common)
quadrant2 <- categorized_combined_data %>%
filter(rank_cardiac < median_cardiac & rank_common > median_common)
quadrant3 <- categorized_combined_data %>%
filter(rank_cardiac > median_cardiac & rank_common < median_common)
quadrant4 <- categorized_combined_data %>%
filter(rank_cardiac >= median_cardiac & rank_common >= median_common)
# Combine into one data frame
combined_quadrants <- bind_rows(
quadrant1 %>% mutate(Quadrant = 'Quadrant 1'),
quadrant2 %>% mutate(Quadrant = 'Quadrant 2'),
quadrant3 %>% mutate(Quadrant = 'Quadrant 3'),
quadrant4 %>% mutate(Quadrant = 'Quadrant 4')
)
# Calculate the count of individuals in each quadrant for each category
# and the total count of individuals in each category
# NOTE: the result remains grouped by Category (no ungroup()), which is why
# it prints as "Groups: Category [2]".
percentage_by_category <- combined_quadrants %>%
group_by(Category, Quadrant) %>%
summarise(Count = n(), .groups = 'drop') %>%
group_by(Category) %>%
mutate(Total = sum(Count), Percentage = Count / Total * 100)
# View the result
percentage_by_category
## # A tibble: 8 × 5
## # Groups: Category [2]
## Category Quadrant Count Total Percentage
## <fct> <chr> <int> <int> <dbl>
## 1 Control Quadrant 1 235 537 43.8
## 2 Control Quadrant 2 107 537 19.9
## 3 Control Quadrant 3 98 537 18.2
## 4 Control Quadrant 4 97 537 18.1
## 5 zCase Quadrant 1 79 456 17.3
## 6 zCase Quadrant 2 75 456 16.4
## 7 zCase Quadrant 3 84 456 18.4
## 8 zCase Quadrant 4 218 456 47.8
# Calculate overall proportions for each quadrant
overall_counts <- combined_quadrants %>%
  count(Quadrant) %>%
  mutate(OverallProportion = n / sum(n))
# Calculate total counts by category for scaling expected values
category_totals <- combined_quadrants %>%
  count(Category) %>%
  dplyr::rename(TotalInCategory = n)
# Expected counts under independence of Category and Quadrant
expected_counts <- combined_quadrants %>%
  dplyr::select(Category, Quadrant) %>%
  distinct() %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory)
# Observed counts per Category x Quadrant
observed_counts <- combined_quadrants %>%
  count(Category, Quadrant) %>%
  dplyr::rename(Observed = n)
# Combine observed and expected for plotting.
# FIX: Ratio was previously computed twice (in this mutate and again in a
# second, redundant mutate immediately afterwards) — consolidated to one step.
combined_for_plotting <- combined_quadrants %>%
  count(Category, Quadrant) %>%
  dplyr::rename(Observed = n) %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory, Ratio = Observed / Expected)
# Plot the Observed/Expected ratio; the dashed line marks Ratio = 1
# (observed equals expected under independence).
quadrant_plot <- ggplot(combined_for_plotting, aes(x = Quadrant, y = Ratio, fill = Category)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(y = "Observed/Expected Ratio", x = "Quadrant", title = "Observed/Expected Ratios by Category and Quadrant") +
  theme_cowplot(12) +
  scale_fill_manual(values = group_colors) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "Black")
quadrant_plot
# Create the contingency table
# 2 (Category) x 4 (Quadrant) table of observed counts.
contingency_table <- xtabs(Observed ~ Category + Quadrant, data = observed_counts)
# Print the contingency table
print(contingency_table)
## Quadrant
## Category Quadrant 1 Quadrant 2 Quadrant 3 Quadrant 4
## Control 235 107 98 97
## zCase 79 75 84 218
# Chi-squared test
# Tests whether quadrant membership is independent of Category.
chi_squared_result <- chisq.test(contingency_table)
# Print the results of the chi-squared test
print(chi_squared_result)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 124.91, df = 3, p-value < 2.2e-16
Filter high cardiac-score individuals, visualize their cardiac vs. common variant scores, and assess interactions
# Restrict to individuals in the top two cardiac-score quintiles (4 and 5).
top_2_quintiles <- categorized_combined_data %>%
  filter(cardiac_score_quintiles > 3)
# Cardiac vs common variant score with a per-Category linear fit,
# plus one individual of interest (CGM0000563) highlighted in red.
common_density_score_plot <- ggplot(top_2_quintiles, aes(x = cardiac_variant_score, y = common_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +
  scale_color_manual(values = group_colors) +
  theme_cowplot(12) +
  labs(x = "Cardiac Variant Score", y = "Common Variant Score", title = "Cardiac Variant Score vs. Common Variant Score by Category") +
  geom_point(data = subset(top_2_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = common_variant_score), color = "red")
# Print
print(common_density_score_plot)
## `geom_smooth()` using formula = 'y ~ x'
#Do the stats
# Linear model with a Category interaction: the interaction term tests whether
# the cardiac-vs-common score slope differs between Case and Control.
model_common_cardiac <- lm(common_variant_score ~ cardiac_variant_score * Category, data = top_2_quintiles)
summary(model_common_cardiac)
##
## Call:
## lm(formula = common_variant_score ~ cardiac_variant_score * Category,
## data = top_2_quintiles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8514 -0.3866 0.0569 0.4076 1.5320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.22986 0.05737 -4.006 7.38e-05 ***
## cardiac_variant_score -0.16418 0.09335 -1.759 0.0794 .
## CategoryzCase 0.40285 0.07411 5.436 9.59e-08 ***
## cardiac_variant_score:CategoryzCase 0.16371 0.09635 1.699 0.0901 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6477 on 394 degrees of freedom
## Multiple R-squared: 0.1034, Adjusted R-squared: 0.09659
## F-statistic: 15.15 on 3 and 394 DF, p-value: 2.386e-09
Visualize the relationship between cardiac and epilepsy variant scores
# Cardiac vs epilepsy variant score among the top two cardiac-score quintiles,
# with a per-Category linear fit and CGM0000563 highlighted in red.
epilepsy_density_score_plot <- ggplot(top_2_quintiles, aes(x = cardiac_variant_score, y = epilepsy_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +
  scale_color_manual(values = group_colors) +
  theme_cowplot(12) +
  labs(x = "Cardiac Variant Score", y = "Epilepsy Variant Score", title = "Cardiac Variant Score vs. Epilepsy Variant Score by Category") +
  geom_point(data = subset(top_2_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = epilepsy_variant_score), color = "red")
# Print
print(epilepsy_density_score_plot)
## `geom_smooth()` using formula = 'y ~ x'
#Do the stats
# Linear model with a Category interaction: tests whether the cardiac-vs-
# epilepsy score slope differs between Case and Control.
model_epilepsy_cardiac <- lm(epilepsy_variant_score ~ cardiac_variant_score * Category, data = top_2_quintiles)
summary(model_epilepsy_cardiac)
##
## Call:
## lm(formula = epilepsy_variant_score ~ cardiac_variant_score *
## Category, data = top_2_quintiles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8187 -0.4169 -0.0110 0.3811 9.3169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1577 0.0861 -1.832 0.0677 .
## cardiac_variant_score -0.1611 0.1401 -1.150 0.2509
## CategoryzCase -0.2170 0.1112 -1.952 0.0517 .
## cardiac_variant_score:CategoryzCase 0.9352 0.1446 6.468 2.95e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.972 on 394 degrees of freedom
## Multiple R-squared: 0.5589, Adjusted R-squared: 0.5555
## F-statistic: 166.4 on 3 and 394 DF, p-value: < 2.2e-16
Do the stats for the interaction data
# do wilcox_test
# Wilcoxon rank-sum: epilepsy variant score by Category among the top two
# cardiac-score quintiles. The %in% filter is a safety guard; it is a no-op
# if Category only contains "zCase"/"Control".
CMARR_epilepsy_wilcox_test_results <- wilcox.test(epilepsy_variant_score ~ Category,
data = top_2_quintiles %>% filter(Category %in% c("zCase", "Control")))
CMARR_epilepsy_wilcox_test_results
##
## Wilcoxon rank sum test with continuity correction
##
## data: epilepsy_variant_score by Category
## W = 11515, p-value = 4.869e-10
## alternative hypothesis: true location shift is not equal to 0
# Same comparison for the common variant score.
CMARR_common_wilcox_test_results <- wilcox.test(common_variant_score ~ Category,
data = top_2_quintiles %>% filter(Category %in% c("zCase", "Control")))
CMARR_common_wilcox_test_results
##
## Wilcoxon rank sum test with continuity correction
##
## data: common_variant_score by Category
## W = 11649, p-value = 1.063e-09
## alternative hypothesis: true location shift is not equal to 0
Calculate how well GAPS performs in PLP- individuals
# How well does GAPS perform in P/LP-negative individuals?
# Keep individuals with no cardiomyopathy P/LP variant whose model probability
# clears the discovery threshold of 0.83.
no_PLPs_categorized_data <- categorized_combined_data %>%
  filter(PLP_Cardiomyopathy < 1 & probability > 0.83)
# FIX: arrest_group holds the strings "Control"/"Case" (recoded upstream from
# arrest_ontology), so the previous numeric comparisons were wrong:
# `arrest_group == 1` was always FALSE (reporting 0 controls regardless of the
# data) and `arrest_group > 1` matched EVERY row via character coercion
# (reporting the total row count as "cases"). Compare against labels instead;
# the previously echoed counts (0 / 22) should be re-verified after this fix.
paste("number of PLP negative DISCOVERY controls over the threshold")
nrow(no_PLPs_categorized_data %>% filter(arrest_group == "Control"))
paste("number of PLP negative DISCOVERY cases over the threshold")
nrow(no_PLPs_categorized_data %>% filter(arrest_group == "Case"))
# For replication, use the relaxed probability threshold of 0.7
no_PLPs_categorized_replication_data <- categorized_replication_data %>%
  filter(PLP_Cardiomyopathy < 1 & probability > 0.7)
paste("number of PLP negative REPLICATION controls over the threshold")
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group == "Control"))
paste("number of PLP negative REPLICATION cases over the threshold")
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group == "Case"))
NOW, retrain the model on the validation cohort and apply it to the discovery cohort
# Refit the full GAPS logistic model using ONLY the replication cohort
# (reverse validation: these betas are later applied to the discovery cohort).
model_replication <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null,
data = categorized_replication_data, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): this warning indicates (quasi-)complete separation in the
# small replication cohort (126 obs, 21 predictors); extreme coefficients
# such as PLP_Epilepsy (-9.84) and Cardiomyopathy_null (6.53) may be unstable.
model_replication
##
## Call: glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval +
## Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
## Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI +
## Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare +
## Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy +
## Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare +
## Cardiomyopathy_null + Epilepsy_null, family = binomial(),
## data = categorized_replication_data)
##
## Coefficients:
## (Intercept) Normalized_Heart_rate
## -10.632909 -0.489645
## Normalized_PR_interval Normalized_QTc
## -0.468604 -0.520174
## Normalized_BP Normalized_QRS
## 0.909731 0.538230
## Normalized_LVEF Normalized_LVESV
## -0.863017 -0.791458
## Normalized_SVI Normalized_Afib
## 0.005339 0.842873
## Normalized_LVEDVI Normalized_SV
## -0.774451 0.390626
## Epilepsy_rare Epilepsy_ultrarare
## -0.127564 -1.765110
## Cardiomyopathy_rare Cardiomyopathy_ultrarare
## -0.560834 1.434416
## PLP_Epilepsy PLP_Cardiomyopathy
## -9.841170 0.909586
## Cardiomyopathy_noncoding_rare Epilepsy_noncoding_rare
## -0.051308 0.361626
## Cardiomyopathy_null Epilepsy_null
## 6.534668 0.974009
##
## Degrees of Freedom: 125 Total (i.e. Null); 104 Residual
## Null Deviance: 174.2
## Residual Deviance: 47.6 AIC: 91.6
# Apply the replication-trained model to the discovery cohort on the linear
# predictor (logit) scale.
scores_replication_model_on_discovery <- predict(model_replication, newdata = categorized_combined_data, type = "link")
# Add scores to the dataframes
categorized_combined_data$scores_replication_model_on_discovery <- scores_replication_model_on_discovery
# z normalize
# FIX: added na.rm = TRUE so that NA predictions (e.g. from missing predictor
# values in newdata) cannot turn the mean/sd — and hence every plotted value —
# into NA.
mean_replication_on_discovery <- mean(categorized_combined_data$scores_replication_model_on_discovery, na.rm = TRUE)
sd_replication_on_discovery <- sd(categorized_combined_data$scores_replication_model_on_discovery, na.rm = TRUE)
scores_replication_model_on_discovery_plot <- ggplot(categorized_combined_data, aes(x = Category, y = (scores_replication_model_on_discovery - mean_replication_on_discovery)/sd_replication_on_discovery, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  ylim(-2.5, 2.5) +  # values outside this range are dropped (warning below)
  theme_cowplot(12)
scores_replication_model_on_discovery_plot
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Wilcoxon rank-sum test: do Case and Control score distributions differ?
wilcox.test_result_replication_on_discovery <- wilcox.test(scores_replication_model_on_discovery ~ Category, data = categorized_combined_data)
# View the results
print(wilcox.test_result_replication_on_discovery)
##
## Wilcoxon rank sum test with continuity correction
##
## data: scores_replication_model_on_discovery by Category
## W = 84150, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
Calculate GAPS on betas trained and applied to replication data (within-replication model)
# Apply the replication-trained model back to the replication cohort itself
# (in-sample scores; expected to look better than cross-cohort application).
scores_replication_model_on_replication <- predict(model_replication, newdata = categorized_replication_data, type = "link")
# Add scores to the dataframes
categorized_replication_data$scores_replication_model_on_replication <- scores_replication_model_on_replication
# z normalize
# FIX: added na.rm = TRUE so NA predictions cannot poison the mean/sd.
mean_replication_on_replication <- mean(categorized_replication_data$scores_replication_model_on_replication, na.rm = TRUE)
sd_replication_on_replication <- sd(categorized_replication_data$scores_replication_model_on_replication, na.rm = TRUE)
scores_replication_model_on_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (scores_replication_model_on_replication - mean_replication_on_replication)/sd_replication_on_replication, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  ylim(-2.5, 2.5) +  # values outside this range are dropped (warning below)
  theme_cowplot(12)
scores_replication_model_on_replication_plot
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Wilcoxon rank-sum test (FIX: comment previously said "t-test")
wilcox.test_replication_on_replication <- wilcox.test(scores_replication_model_on_replication ~ Category, data = categorized_replication_data)
# View the results
print(wilcox.test_replication_on_replication)
##
## Wilcoxon rank sum test with continuity correction
##
## data: scores_replication_model_on_replication by Category
## W = 97, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
Now calculate AUC for the original data using replication-derived betas (reverse validation)
# Convert the replication-model linear predictor to a probability via the
# inverse logit, then assess discrimination on the discovery cohort.
categorized_combined_data <- categorized_combined_data %>%
  mutate(probability_replication_on_discovery = plogis(scores_replication_model_on_discovery))
# Convert 'Category' into a binary format: 0 for control, 1 for case
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability_replication_on_discovery, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full)
## Area under the curve: 0.6564
# DeLong 95% confidence interval for the AUC
ci_auc <- ci.auc(roc_result_full)
# Print the confidence interval
print(ci_auc)
## 95% CI: 0.6221-0.6906 (DeLong)
Replication model on discovery data Odds Ratio
# Rank the replication-model-on-discovery scores and bin them into quintiles.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    replication_model_on_discovery_rank = rank(scores_replication_model_on_discovery, ties.method = "first"),
    replication_model_on_discovery_score_quintiles = ceiling(replication_model_on_discovery_rank / (n() / 5))
  )
# Per-quintile odds ratios (calculate_odds_ratios is a helper defined earlier
# in the file).
replication_model_on_discovery_combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, replication_model_on_discovery_score_quintiles)
# Log2 odds ratio per quintile with connecting line.
plot(log2(replication_model_on_discovery_combined_odds_summary$odds_ratio),
     ylim = c(-2, 10),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")
lines(seq_along(replication_model_on_discovery_combined_odds_summary$odds_ratio),
      log2(replication_model_on_discovery_combined_odds_summary$odds_ratio), col = "#3C8C78")
# Whiskers for the 95% confidence intervals.
arrows(x0 = seq_along(replication_model_on_discovery_combined_odds_summary$odds_ratio),
       y0 = log2(replication_model_on_discovery_combined_odds_summary$lower_ci),
       y1 = log2(replication_model_on_discovery_combined_odds_summary$upper_ci),
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")
ANCESTRY
Determine optimal PCs where R2 is not dropping and when RMSE does not rise
# Filter to controls only so the ancestry-PC effect on scores is estimated in
# unaffected individuals.
filtered_categorized_combined_data <- subset(categorized_combined_data, Category == "Control")
# Define the number of PCs to try
num_pcs <- 1:20
# One row of CV metrics per candidate PC count
cv_results <- data.frame(PCs = num_pcs, R2 = NA, RMSE = NA)
# 10-fold cross-validation of `scores ~ PC1 + ... + PCi` for each i.
# FIX: renamed the loop variables from `formula`/`model` to cv_formula/cv_model.
# The old names clobbered the global `model` (the full glm fitted earlier and
# reused by the deviance-partition section) and masked stats::formula.
# NOTE(review): no set.seed() before CV, so fold assignment is not reproducible.
for (i in num_pcs) {
  cv_formula <- as.formula(paste0("scores ~ ", paste0("PC", 1:i, collapse = " + ")))
  cv_model <- train(cv_formula, data = filtered_categorized_combined_data, method = "lm",
                    trControl = trainControl(method = "cv", number = 10))
  # Store R² and RMSE for each model
  cv_results$R2[i] <- cv_model$results$Rsquared
  cv_results$RMSE[i] <- cv_model$results$RMSE
}
# View the results to choose the optimal number of PCs
print(cv_results)
## PCs R2 RMSE
## 1 1 0.07556031 0.8552564
## 2 2 0.17565864 0.8135567
## 3 3 0.18756609 0.8140989
## 4 4 0.18713526 0.8042655
## 5 5 0.18644367 0.8081648
## 6 6 0.18811397 0.8069692
## 7 7 0.20011136 0.8089212
## 8 8 0.18121482 0.8048622
## 9 9 0.17593950 0.8129213
## 10 10 0.17775437 0.8122386
## 11 11 0.16777788 0.8134462
## 12 12 0.17388580 0.8178958
## 13 13 0.16904333 0.8160226
## 14 14 0.16621049 0.8224263
## 15 15 0.16633919 0.8210972
## 16 16 0.14802014 0.8287491
## 17 17 0.16367316 0.8192258
## 18 18 0.16472407 0.8213723
## 19 19 0.16679931 0.8194001
## 20 20 0.16339218 0.8260343
ggplot(cv_results, aes(x = PCs)) +
geom_line(aes(y = R2), color = "blue") +
geom_line(aes(y = RMSE), color = "red") +
ylab("Model Performance (R² / RMSE)") +
ggtitle("Cross-Validation of PCs") +
theme_minimal()
Regress out the influence of the ancestry PCs
# Fit the ancestry model on controls only: scores ~ PC1..PC10.
ancestry_regression_model <- lm(scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
                                data = filtered_categorized_combined_data)
# Coefficients kept available for inspection/reporting
ancestry_coefficients <- coef(ancestry_regression_model)
# Residualize every sample's score against ancestry: subtract the predicted
# component. predict() reproduces the former hand-written
# intercept + PC1*b1 + ... + PC10*b10 sum exactly, with far less repetition.
categorized_combined_data$adjusted_scores <- categorized_combined_data$scores -
  predict(ancestry_regression_model, newdata = categorized_combined_data)
Get adjusted probabilities
# Map the ancestry-adjusted scores to probabilities via the logistic CDF
categorized_combined_data$adjusted_probability <- plogis(categorized_combined_data$adjusted_scores)
Perform Gaussian-mixture clustering on the PCs
# Pull out the principal-component columns (PC1..PC20) by name pattern
pc_col_idx <- grep("^PC\\d+$", names(categorized_combined_data))
pca_columns <- categorized_combined_data[, pc_col_idx]
# Top-level Gaussian mixture model with 4 components
gmm_result <- Mclust(pca_columns, G = 4)
initial_clusters <- gmm_result$classification
# Refine cluster 1 with a 2-component sub-GMM
cluster1_data <- pca_columns[initial_clusters == 1, ]
sub_gmm_result <- Mclust(cluster1_data, G = 2)
# Start from the top-level labels; refined labels overwrite below
final_clusters <- initial_clusters
# Offset by 4 so the sub-cluster labels (5, 6) cannot collide with the
# original labels (1-4)
final_clusters[initial_clusters == 1] <- sub_gmm_result$classification + 4
# Refine cluster 2 the same way; offsetting by 6 yields labels 7 and 8
cluster2_data <- pca_columns[initial_clusters == 2, ]
sub_gmm_result2 <- Mclust(cluster2_data, G = 2)
final_clusters[initial_clusters == 2] <- sub_gmm_result2$classification + 6
Add cluster assignments to data
# Attach the final cluster labels to the dataframe as a factor
categorized_combined_data$cluster_gmm <- as.factor(final_clusters)
# Scatter of the first two PCs colored by GMM cluster
gmm_scatter <- ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm)) +
  geom_point() +
  ggtitle("GMM Clustering of PCA Data") +
  theme_minimal() +
  scale_color_manual(values = rainbow(length(unique(final_clusters)))) +
  theme(legend.position = "bottom")
gmm_scatter
# Cross-tabulate cluster membership against self-reported ancestry
contingency_table <- table(categorized_combined_data$cluster_gmm, categorized_combined_data$Ancestry)
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("Cluster", "Ancestry", "Count")
# Grouped bar chart: ancestry composition of each cluster
cluster_ancestry_bars <- ggplot(contingency_df, aes(x = Cluster, y = Count, fill = Ancestry)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Makeup of Each Cluster by Ancestry") +
  xlab("Cluster") + ylab("Count") +
  theme_minimal() +
  theme(legend.position = "bottom")
cluster_ancestry_bars
Plot the PCs and the ancestry clusters informed by self-reported identities, and plot the adjusted data
# Map GMM cluster labels to ancestry groups (informed by self-reported identity).
# Clusters 1 and 2 were replaced by sub-clusters 5-8 above, so they need no entry.
cluster_to_ancestry <- c("6" = "EUR", "7" = "HIS", "5" = "AFR", "3" = "OTH", "4" = "OTH", "8" = "HIS")
# Vectorized named lookup (replaces a redundant pre-assignment plus a
# per-element sapply — identical values, one pass)
categorized_combined_data$cluster_gmm_Ancestry <- unname(cluster_to_ancestry[as.character(categorized_combined_data$cluster_gmm)])
# Fix the factor level order to the order the mapping introduces: EUR, HIS, AFR, OTH
categorized_combined_data$cluster_gmm_Ancestry <- factor(categorized_combined_data$cluster_gmm_Ancestry, levels = unique(cluster_to_ancestry))
# Counts per Category x ancestry, used to label the stacked bars below.
# .groups = "drop" replaces the separate ungroup() and silences the
# "grouped output" message.
summary_data <- categorized_combined_data %>%
  group_by(Category, cluster_gmm_Ancestry) %>%
  summarise(count = n(), .groups = "drop")
# One palette color per ancestry group
palette_colors <- wes_palette("FantasticFox1", length(unique(categorized_combined_data$cluster_gmm_Ancestry)), type = "continuous")
# Stacked percentage bars of ancestry within each Category, labeled with counts.
# NOTE: theme_cowplot(12) is a complete theme — it must come BEFORE
# theme(legend.position = ...), otherwise it silently resets the legend
# back to the default position (the original had the order reversed).
ancestry_makeup <- ggplot(categorized_combined_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
  xlab("Category") +
  ylab("Percentage") +
  scale_fill_manual(values = palette_colors) +
  geom_text(data = summary_data, aes(label = count, y = count / sum(count), group = cluster_gmm_Ancestry),
            position = position_fill(vjust = 0.5), color = "black") +
  theme_cowplot(12) +
  theme(legend.position = "bottom")
# PC1 vs PC2 colored by ancestry-labeled cluster.
# The original added theme_minimal(), then theme(legend.position = "bottom"),
# then theme_cowplot(12): each complete theme replaces everything before it,
# so theme_minimal() was dead code and the bottom legend was discarded.
# Apply the complete theme first, then the legend tweak.
clusters <- ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm_Ancestry)) +
  geom_point() +
  ggtitle("GMM Clustering of PCA Data") +
  scale_color_manual(values = palette_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom")
# Boxplots of raw scores by ancestry cluster, split by Category.
# theme_cowplot(12) is a complete theme: applied after theme(legend.position=...)
# it silently resets the legend, so it must come first.
ancestry_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = scores, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)  # NOTE(review): ylim() drops out-of-range points before stats; coord_cartesian() would zoom only — confirm intent
# Boxplots of ancestry-adjusted scores by ancestry cluster, split by Category.
# Complete theme (theme_cowplot) first, then the legend tweak — the reverse
# order silently discards the bottom-legend request.
ancestry_adjusted_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = adjusted_scores, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Adjusted Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Adjusted Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)
# Adjusted scores by Category, pooled across ancestries.
# Complete theme (theme_cowplot) first, then the legend tweak — the reverse
# order silently discards the bottom-legend request.
ancestry_adjusted_total_scores <- ggplot(categorized_combined_data, aes(x = Category, y = adjusted_scores, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Adjusted Scores by Category",
       x = "Category",
       y = "Adjusted Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)
# Boxplots of the cardiac rare-variant score by ancestry cluster and Category.
# Complete theme (theme_cowplot) first, then the legend tweak.
cardiac_variant_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = cardiac_variant_score, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Cardiac Variant Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Cardiac Variant Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)
# Boxplots of the epilepsy rare-variant score by ancestry cluster and Category.
# Complete theme (theme_cowplot) first, then the legend tweak.
epilepsy_variant_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = epilepsy_variant_score, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Epilepsy Variant Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Epilepsy Variant Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)
# Boxplots of the common-variant (PGS) score by ancestry cluster and Category.
# Complete theme (theme_cowplot) first, then the legend tweak.
# NOTE(review): the name `common_variant_scores` is later reused for a character
# vector of score column names — the plot is printed before that, but a rename
# would be safer.
common_variant_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = common_variant_score, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Common Variant Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Common Variant Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)
# Boxplots of the noncoding-variant score by ancestry cluster and Category.
# Complete theme (theme_cowplot) first, then the legend tweak.
noncoding_variant_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = noncoding_variant_score, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Noncoding Variant Scores by cluster_gmm_Ancestry and Category",
       x = "cluster_gmm_Ancestry",
       y = "Noncoding Variant Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)
# Display the cluster scatter and ancestry-makeup plots (no warnings expected)
print(clusters)
print(ancestry_makeup)
# Display the score boxplots; notch/ylim warnings are expected and suppressed.
# Some panels report "Notch went outside hinges" — consider notch = FALSE there.
score_plots <- list(ancestry_scores, ancestry_adjusted_scores,
                    ancestry_adjusted_total_scores, cardiac_variant_scores,
                    epilepsy_variant_scores, common_variant_scores,
                    noncoding_variant_scores)
for (p in score_plots) {
  suppressWarnings(print(p))
}
# Overall Wilcoxon test: do adjusted scores differ between cases and controls?
wilcox.test(adjusted_scores ~ Category, data = categorized_combined_data)
##
##  Wilcoxon rank sum test with continuity correction
##
## data:  adjusted_scores by Category
## W = 79303, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Per-ancestry Wilcoxon tests on the adjusted scores
adjusted_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(p_value = wilcox.test(adjusted_scores ~ Category)$p.value)
print(adjusted_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry       p_value
##   <fct>                        <dbl>
## 1 EUR                  0.00000000433
## 2 HIS                  0.0181
## 3 AFR                  0.0000162
## 4 OTH                  0.00548
# Per-ancestry Wilcoxon tests on the raw (unadjusted) scores, for comparison
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(p_value = wilcox.test(scores ~ Category)$p.value)
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry  p_value
##   <fct>                   <dbl>
## 1 EUR                  1.75e-12
## 2 HIS                  2.43e- 3
## 3 AFR                  1.17e- 7
## 4 OTH                  1.32e- 3
Interaction model to evaluate ancestry × PGS effects on case/control status
# Interaction model: does genetic ancestry (GMM-derived cluster) modify the
# association between each normalized PGS and case/control status?
# Each `PGS * cluster_gmm_Ancestry` term expands to main effects plus the
# PGS-by-ancestry interaction; family = binomial makes this a logistic model.
# The recorded summary() output below corresponds to this exact formula.
interaction_model <- glm(Category ~ Normalized_LVEF * cluster_gmm_Ancestry +
Normalized_Heart_rate * cluster_gmm_Ancestry +
Normalized_SV * cluster_gmm_Ancestry +
Normalized_LVEDVI * cluster_gmm_Ancestry +
Normalized_Afib * cluster_gmm_Ancestry +
Normalized_PR_interval * cluster_gmm_Ancestry +
Normalized_LVESV * cluster_gmm_Ancestry +
Normalized_SVI * cluster_gmm_Ancestry +
Normalized_BP * cluster_gmm_Ancestry +
Normalized_QTc * cluster_gmm_Ancestry,
data = categorized_combined_data, family = binomial)
summary(interaction_model)
##
## Call:
## glm(formula = Category ~ Normalized_LVEF * cluster_gmm_Ancestry +
## Normalized_Heart_rate * cluster_gmm_Ancestry + Normalized_SV *
## cluster_gmm_Ancestry + Normalized_LVEDVI * cluster_gmm_Ancestry +
## Normalized_Afib * cluster_gmm_Ancestry + Normalized_PR_interval *
## cluster_gmm_Ancestry + Normalized_LVESV * cluster_gmm_Ancestry +
## Normalized_SVI * cluster_gmm_Ancestry + Normalized_BP * cluster_gmm_Ancestry +
## Normalized_QTc * cluster_gmm_Ancestry, family = binomial,
## data = categorized_combined_data)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.985996 0.189985 5.190
## Normalized_LVEF -0.056041 0.160970 -0.348
## cluster_gmm_AncestryHIS -2.618199 0.270454 -9.681
## cluster_gmm_AncestryAFR -1.501416 0.291432 -5.152
## cluster_gmm_AncestryOTH 0.085666 0.347552 0.246
## Normalized_Heart_rate -0.148686 0.127560 -1.166
## Normalized_SV -0.129167 0.144625 -0.893
## Normalized_LVEDVI 0.099653 0.166314 0.599
## Normalized_Afib -0.119832 0.128547 -0.932
## Normalized_PR_interval 0.233999 0.117628 1.989
## Normalized_LVESV 0.006579 0.184338 0.036
## Normalized_SVI -0.007414 0.176301 -0.042
## Normalized_BP 0.100302 0.128746 0.779
## Normalized_QTc 0.144560 0.129390 1.117
## Normalized_LVEF:cluster_gmm_AncestryHIS 0.161473 0.251740 0.641
## Normalized_LVEF:cluster_gmm_AncestryAFR -0.057026 0.243657 -0.234
## Normalized_LVEF:cluster_gmm_AncestryOTH 0.149925 0.379303 0.395
## cluster_gmm_AncestryHIS:Normalized_Heart_rate 0.279794 0.205353 1.363
## cluster_gmm_AncestryAFR:Normalized_Heart_rate 0.248188 0.199291 1.245
## cluster_gmm_AncestryOTH:Normalized_Heart_rate 0.021320 0.282468 0.075
## cluster_gmm_AncestryHIS:Normalized_SV 0.018127 0.241476 0.075
## cluster_gmm_AncestryAFR:Normalized_SV -0.005352 0.227930 -0.023
## cluster_gmm_AncestryOTH:Normalized_SV -0.099748 0.367285 -0.272
## cluster_gmm_AncestryHIS:Normalized_LVEDVI -0.059796 0.277594 -0.215
## cluster_gmm_AncestryAFR:Normalized_LVEDVI -0.012314 0.242344 -0.051
## cluster_gmm_AncestryOTH:Normalized_LVEDVI 0.162975 0.409278 0.398
## cluster_gmm_AncestryHIS:Normalized_Afib -0.063225 0.207940 -0.304
## cluster_gmm_AncestryAFR:Normalized_Afib 0.092741 0.189536 0.489
## cluster_gmm_AncestryOTH:Normalized_Afib 0.318674 0.279610 1.140
## cluster_gmm_AncestryHIS:Normalized_PR_interval -0.107243 0.195593 -0.548
## cluster_gmm_AncestryAFR:Normalized_PR_interval -0.006562 0.185089 -0.035
## cluster_gmm_AncestryOTH:Normalized_PR_interval 0.140872 0.354449 0.397
## cluster_gmm_AncestryHIS:Normalized_LVESV 0.611566 0.302932 2.019
## cluster_gmm_AncestryAFR:Normalized_LVESV 0.002846 0.265806 0.011
## cluster_gmm_AncestryOTH:Normalized_LVESV 0.084992 0.343738 0.247
## cluster_gmm_AncestryHIS:Normalized_SVI -0.031809 0.270997 -0.117
## cluster_gmm_AncestryAFR:Normalized_SVI 0.088692 0.237651 0.373
## cluster_gmm_AncestryOTH:Normalized_SVI 0.064705 0.353658 0.183
## cluster_gmm_AncestryHIS:Normalized_BP 0.081335 0.195361 0.416
## cluster_gmm_AncestryAFR:Normalized_BP 0.286601 0.198513 1.444
## cluster_gmm_AncestryOTH:Normalized_BP -0.185752 0.315792 -0.588
## cluster_gmm_AncestryHIS:Normalized_QTc -0.256980 0.198895 -1.292
## cluster_gmm_AncestryAFR:Normalized_QTc -0.261544 0.239132 -1.094
## cluster_gmm_AncestryOTH:Normalized_QTc -0.395822 0.295921 -1.338
## Pr(>|z|)
## (Intercept) 2.10e-07 ***
## Normalized_LVEF 0.7277
## cluster_gmm_AncestryHIS < 2e-16 ***
## cluster_gmm_AncestryAFR 2.58e-07 ***
## cluster_gmm_AncestryOTH 0.8053
## Normalized_Heart_rate 0.2438
## Normalized_SV 0.3718
## Normalized_LVEDVI 0.5490
## Normalized_Afib 0.3512
## Normalized_PR_interval 0.0467 *
## Normalized_LVESV 0.9715
## Normalized_SVI 0.9665
## Normalized_BP 0.4359
## Normalized_QTc 0.2639
## Normalized_LVEF:cluster_gmm_AncestryHIS 0.5212
## Normalized_LVEF:cluster_gmm_AncestryAFR 0.8150
## Normalized_LVEF:cluster_gmm_AncestryOTH 0.6926
## cluster_gmm_AncestryHIS:Normalized_Heart_rate 0.1730
## cluster_gmm_AncestryAFR:Normalized_Heart_rate 0.2130
## cluster_gmm_AncestryOTH:Normalized_Heart_rate 0.9398
## cluster_gmm_AncestryHIS:Normalized_SV 0.9402
## cluster_gmm_AncestryAFR:Normalized_SV 0.9813
## cluster_gmm_AncestryOTH:Normalized_SV 0.7859
## cluster_gmm_AncestryHIS:Normalized_LVEDVI 0.8295
## cluster_gmm_AncestryAFR:Normalized_LVEDVI 0.9595
## cluster_gmm_AncestryOTH:Normalized_LVEDVI 0.6905
## cluster_gmm_AncestryHIS:Normalized_Afib 0.7611
## cluster_gmm_AncestryAFR:Normalized_Afib 0.6246
## cluster_gmm_AncestryOTH:Normalized_Afib 0.2544
## cluster_gmm_AncestryHIS:Normalized_PR_interval 0.5835
## cluster_gmm_AncestryAFR:Normalized_PR_interval 0.9717
## cluster_gmm_AncestryOTH:Normalized_PR_interval 0.6910
## cluster_gmm_AncestryHIS:Normalized_LVESV 0.0435 *
## cluster_gmm_AncestryAFR:Normalized_LVESV 0.9915
## cluster_gmm_AncestryOTH:Normalized_LVESV 0.8047
## cluster_gmm_AncestryHIS:Normalized_SVI 0.9066
## cluster_gmm_AncestryAFR:Normalized_SVI 0.7090
## cluster_gmm_AncestryOTH:Normalized_SVI 0.8548
## cluster_gmm_AncestryHIS:Normalized_BP 0.6772
## cluster_gmm_AncestryAFR:Normalized_BP 0.1488
## cluster_gmm_AncestryOTH:Normalized_BP 0.5564
## cluster_gmm_AncestryHIS:Normalized_QTc 0.1963
## cluster_gmm_AncestryAFR:Normalized_QTc 0.2741
## cluster_gmm_AncestryOTH:Normalized_QTc 0.1810
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1370.0 on 992 degrees of freedom
## Residual deviance: 1091.5 on 949 degrees of freedom
## AIC: 1179.5
##
## Number of Fisher Scoring iterations: 4
Now redo the ancestry corrections by adjusting each variable prior to fully incorporating it into GAPS
# Scores to residualize against ancestry PCs: the ten common-variant PGS plus
# rare/ultrarare burden, PLP, noncoding and null scores.
score_list <- c("Normalized_QTc", "Normalized_BP", "Normalized_SVI", "Normalized_LVESV",
                "Normalized_PR_interval", "Normalized_Afib", "Normalized_LVEDVI", "Normalized_SV",
                "Normalized_Heart_rate", "Normalized_LVEF", "Epilepsy_rare", "Epilepsy_ultrarare",
                "Cardiomyopathy_rare", "Cardiomyopathy_ultrarare", "PLP_Epilepsy", "PLP_Cardiomyopathy",
                "Cardiomyopathy_noncoding_rare", "Epilepsy_noncoding_rare", "Cardiomyopathy_null", "Epilepsy_null")
# For each score: fit score ~ PC1..PC10 on the controls, then subtract the
# ancestry-predicted component from every sample. predict() reproduces the
# former hand-written intercept + PC1*b1 + ... + PC10*b10 sum exactly.
pc_rhs <- "~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10"
for (score in score_list) {
  ancestry_regression_model <- lm(as.formula(paste(score, pc_rhs)),
                                  data = filtered_categorized_combined_data)
  categorized_combined_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_combined_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_combined_data)
}
# Logistic regression of case/control on all ancestry-adjusted scores
adjusted_score_list <- paste0(score_list, "_ANCESTRY_adjusted")
adjusted_formula <- as.formula(paste("Category ~", paste(adjusted_score_list, collapse = " + ")))
model_ANCESTRY_adjusted <- glm(adjusted_formula, data = categorized_combined_data, family = binomial())
# Linear predictor (log-odds scale) for each training sample
categorized_combined_data$Scores_ANCESTRY_adjusted <- predict(model_ANCESTRY_adjusted, type = "link")
# Boxplot of pre-integration ancestry-adjusted composite scores by Category.
# theme_cowplot(12) is a complete theme and must come before
# theme(legend.position = ...) or the bottom-legend request is silently reset.
ancestry_ADJUSTED_pre_by_category <- ggplot(categorized_combined_data, aes(x = Category, y = Scores_ANCESTRY_adjusted, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Adjusted Scores by Category", x = "Category", y = "Adjusted Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) + theme(legend.position = "bottom") + ylim(-3, 3)
ancestry_ADJUSTED_pre_by_category
## Warning: Removed 35 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Patterned boxplots (ggpattern) of adjusted scores by ancestry cluster and
# Category. Complete theme first, then the legend tweak — the original order
# let theme_cowplot(12) reset the bottom legend.
ancestry_ADJUSTED_pre_by_ancestry <- ggplot(categorized_combined_data,
                                            aes(x = cluster_gmm_Ancestry,
                                                y = Scores_ANCESTRY_adjusted,
                                                fill = Category,
                                                pattern = Category)) +
  geom_boxplot_pattern(outlier.shape = NA,
                       notch = TRUE,
                       pattern_density = 0.1,
                       pattern_fill = "black",
                       pattern_spacing = 0.01,
                       pattern_angle = 90) +
  labs(title = "Adjusted Scores by Ancestry",
       x = "Ancestry Cluster",
       y = "Adjusted Scores") +
  scale_fill_manual(values = group_colors) +
  scale_pattern_manual(values = c("stripe", "crosshatch", "dot", "none")) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)
ancestry_ADJUSTED_pre_by_ancestry
## Warning: Removed 35 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Training-set ROC/AUC from the adjusted linear predictors
categorized_combined_data$adjusted_probability <- plogis(categorized_combined_data$Scores_ANCESTRY_adjusted)
roc_result_full_PRE_ADJUSTED <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$adjusted_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste("Training AUC:", auc(roc_result_full_PRE_ADJUSTED)))
## [1] "Training AUC: 0.758878107746088"
# Ancestry-adjust each score in the REPLICATION data using models fit on the
# discovery controls (never refit on replication, to avoid leakage).
# predict() on the replication PCs reproduces the former hand-written
# intercept + PC1*b1 + ... + PC10*b10 sum exactly.
for (score in score_list) {
  ancestry_regression_model <- lm(as.formula(paste(score, "~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10")),
                                  data = filtered_categorized_combined_data)
  categorized_replication_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_replication_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_replication_data)
}
# Columns holding the ancestry-adjusted replication scores
adjusted_score_list_replication <- paste0(score_list, "_ANCESTRY_adjusted")
# Predict from the discovery-trained model using ONLY the adjusted columns
replication_adjusted_cols <- categorized_replication_data[, adjusted_score_list_replication, drop = FALSE]
categorized_replication_data$Scores_ANCESTRY_adjusted_replication <- predict(model_ANCESTRY_adjusted,
                                                                             newdata = replication_adjusted_cols,
                                                                             type = "link")
# Logistic transform of the linear predictor to probabilities
categorized_replication_data$adjusted_probability_replication <- plogis(categorized_replication_data$Scores_ANCESTRY_adjusted_replication)
# Replication ROC and AUC
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome,
                                categorized_replication_data$adjusted_probability_replication)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste("Replication AUC:", auc(roc_result_full_adjusted)))
## [1] "Replication AUC: 0.874778649127245"
plot(roc_result_full_adjusted, xlim = c(1, 0))
# Per-ancestry Wilcoxon tests on the fully ancestry-adjusted composite scores
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(p_value = wilcox.test(Scores_ANCESTRY_adjusted ~ Category)$p.value)
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry  p_value
##   <fct>                   <dbl>
## 1 EUR                  6.46e-16
## 2 HIS                  5.57e- 6
## 3 AFR                  8.54e- 9
## 4 OTH                  1.07e- 4
Now plot the data using the ancestry corrections applied to each variable prior to fully incorporating it into GAPS
# Common-variant (PGS) scores to ancestry-adjust.
# NOTE(review): this reuses the name `common_variant_scores`, which earlier held
# a ggplot object; the plot has already been printed by this point, but renaming
# one of the two would be safer — confirm nothing downstream expects the plot.
common_variant_scores <- c("Normalized_QTc", "Normalized_BP", "Normalized_SVI", "Normalized_LVESV",
                           "Normalized_PR_interval", "Normalized_Afib", "Normalized_LVEDVI", "Normalized_SV",
                           "Normalized_Heart_rate", "Normalized_LVEF")
# Residualize each score against PC1..PC10 fit on the controls; predict()
# reproduces the former hand-written intercept + PC sums exactly.
for (score in common_variant_scores) {
  ancestry_regression_model <- lm(as.formula(paste(score, "~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10")),
                                  data = filtered_categorized_combined_data)
  categorized_combined_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_combined_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_combined_data)
}
# Logistic regression of case/control on the adjusted common-variant scores
adjusted_common_variant_scores <- paste0(common_variant_scores, "_ANCESTRY_adjusted")
common_variant_formula <- as.formula(paste("Category ~", paste(adjusted_common_variant_scores, collapse = " + ")))
model_common_variant_ADJUSTED_pre <- glm(common_variant_formula,
                                         data = categorized_combined_data, family = binomial())
# Linear predictor (log-odds) per sample = the composite common-variant score
categorized_combined_data$Common_Variant_Score_ADJUSTED_pre <- predict(model_common_variant_ADJUSTED_pre, type = "link")
# Boxplot of the adjusted common-variant score by Category.
# theme_cowplot(12) is a complete theme and must precede
# theme(legend.position = ...) or the bottom-legend request is silently reset.
common_variant_plot_by_category_ADJUSTED_pre <- ggplot(categorized_combined_data, aes(x = Category, y = Common_Variant_Score_ADJUSTED_pre, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Common Variant Score by Category", x = "Category", y = "Common Variant Score") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) + theme(legend.position = "bottom") + ylim(-3, 3)
common_variant_plot_by_category_ADJUSTED_pre
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
# Boxplot of the adjusted common-variant score by ancestry cluster and Category.
# Complete theme (theme_cowplot) first, then the legend tweak.
common_variant_plot_by_ancestry_ADJUSTED_pre <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = Common_Variant_Score_ADJUSTED_pre, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Common Variant Score by Ancestry", x = "Ancestry Cluster", y = "Common Variant Score") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) + theme(legend.position = "bottom") + ylim(-3, 3)
common_variant_plot_by_ancestry_ADJUSTED_pre
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
# Pooled (all ancestries) Wilcoxon test on the adjusted common-variant score —
# the original comment incorrectly described this first test as per-ancestry.
overall_wilcox_results <- categorized_combined_data %>%
  summarise(p_value = wilcox.test(Common_Variant_Score_ADJUSTED_pre ~ Category)$p.value)
print(overall_wilcox_results)
##        p_value
## 1 5.584507e-20
# Per-ancestry Wilcoxon tests on the same score
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(p_value = wilcox.test(Common_Variant_Score_ADJUSTED_pre ~ Category)$p.value)
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry     p_value
##   <fct>                      <dbl>
## 1 EUR                  0.000000680
## 2 HIS                  0.0000165
## 3 AFR                  0.000238
## 4 OTH                  0.0857
Compute the per-quintile odds ratios across ancestry groups before and after the post-integration adjustments
categorized_combined_data <- categorized_combined_data %>%
mutate(
rank_adjusted = rank(adjusted_scores, ties.method = "first"),
score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
)
# Subset the combined data by GMM ancestry cluster. The helper keeps the
# original base-R `==` row indexing (so NA-ancestry rows behave exactly as
# before) while avoiding four copies of the same expression.
subset_by_ancestry <- function(df, ancestry) {
df[df$cluster_gmm_Ancestry == ancestry, ]
}
AFR <- subset_by_ancestry(categorized_combined_data, "AFR")
HIS <- subset_by_ancestry(categorized_combined_data, "HIS")
EUR <- subset_by_ancestry(categorized_combined_data, "EUR")
OTH <- subset_by_ancestry(categorized_combined_data, "OTH")
# Apply the OR function defined earlier in the file: calculate_odds_ratios
# tabulates cases/controls per quintile and returns odds ratios relative to
# quintile 1 with 95% CIs (see the printed tibbles below).
# NOTE: these calls use `score_quintiles`, the quintiles of the *unadjusted*
# score computed earlier -- not the adjusted quintiles created just above.
combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
combined_odds_summary_AFR = calculate_odds_ratios(AFR, Category, score_quintiles)
combined_odds_summary_HIS = calculate_odds_ratios(HIS, Category, score_quintiles)
combined_odds_summary_EUR = calculate_odds_ratios(EUR, Category, score_quintiles)
combined_odds_summary_OTH = calculate_odds_ratios(OTH, Category, score_quintiles)
combined_odds_summary
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 162 36 0.222 1
## 2 2 138 61 0.442 1.99
## 3 3 116 82 0.707 3.18
## 4 4 89 110 1.24 5.56
## 5 5 32 167 5.22 23.5
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, AFR subset (unadjusted score quintiles)
combined_odds_summary_AFR
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 82 21 0.256 1
## 2 2 56 18 0.321 1.26
## 3 3 38 21 0.553 2.16
## 4 4 13 13 1 3.90
## 5 5 4 17 4.25 16.6
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, HIS subset (unadjusted score quintiles)
combined_odds_summary_HIS
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 69 10 0.145 1
## 2 2 56 9 0.161 1.11
## 3 3 41 13 0.317 2.19
## 4 4 49 17 0.347 2.39
## 5 5 15 9 0.6 4.14
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, EUR subset (unadjusted score quintiles)
combined_odds_summary_EUR
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 8 2 0.25 1
## 2 2 19 24 1.26 5.05
## 3 3 29 39 1.34 5.38
## 4 4 23 69 3 12
## 5 5 11 112 10.2 40.7
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, OTH subset (unadjusted score quintiles)
combined_odds_summary_OTH
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 3 3 1 1
## 2 2 7 10 1.43 1.43
## 3 3 8 9 1.12 1.12
## 4 4 4 11 2.75 2.75
## 5 5 2 29 14.5 14.5
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Apply the OR function again, now binning on the *post-integration*
# ancestry-adjusted score quintiles (score_adjusted_quintiles).
combined_odds_summary_adjust= calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR_adjust = calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS_adjust = calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR_adjust = calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH_adjust = calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)
combined_odds_summary_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 134 64 0.478 1
## 2 2 131 68 0.519 1.09
## 3 3 118 80 0.678 1.42
## 4 4 105 94 0.895 1.87
## 5 5 49 150 3.06 6.41
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, AFR subset (adjusted score quintiles)
combined_odds_summary_AFR_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 52 16 0.308 1
## 2 2 40 11 0.275 0.894
## 3 3 45 16 0.356 1.16
## 4 4 45 25 0.556 1.81
## 5 5 11 22 2 6.5
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, HIS subset (adjusted score quintiles)
combined_odds_summary_HIS_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 56 8 0.143 1
## 2 2 57 11 0.193 1.35
## 3 3 44 13 0.295 2.07
## 4 4 43 17 0.395 2.77
## 5 5 30 9 0.3 2.1
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, EUR subset (adjusted score quintiles)
combined_odds_summary_EUR_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 23 30 1.30 1
## 2 2 27 41 1.52 1.16
## 3 3 20 44 2.2 1.69
## 4 4 14 47 3.36 2.57
## 5 5 6 84 14 10.7
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, OTH subset (adjusted score quintiles)
combined_odds_summary_OTH_adjust
## # A tibble: 5 × 8
## score_adjusted_quinti…¹ count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 3 10 3.33 1
## 2 2 7 5 0.714 0.214
## 3 3 9 7 0.778 0.233
## 4 4 3 5 1.67 0.5
## 5 5 2 35 17.5 5.25
## # ℹ abbreviated name: ¹score_adjusted_quintiles
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
Extract the odds ratios by ancestry for plotting
# Collect the quintile-5 (top 20%) odds ratio and 95% CI bounds from each
# per-group summary into one data frame for the forest-style plot below.
or_summaries <- list(
Total = combined_odds_summary,
AFR = combined_odds_summary_AFR,
HIS = combined_odds_summary_HIS,
EUR = combined_odds_summary_EUR,
OTH = combined_odds_summary_OTH
)
# Pull row 5 of the named column from every summary, in list order
pull_q5 <- function(col) {
unname(vapply(or_summaries, function(s) s[[col]][5], numeric(1)))
}
quintile_5_data <- data.frame(
group = names(or_summaries),
odds_ratio = pull_q5("odds_ratio"),
lower_ci = pull_q5("lower_ci"),
upper_ci = pull_q5("upper_ci")
)
# Log2-transform the ORs and CI bounds so the plot is symmetric around 0
# (log2 OR = 0 corresponds to OR = 1, i.e. no enrichment).
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)
# Fixed y-limits (range(-2, 6.5) is just c(-2, 6.5)); kept identical across
# the unadjusted/adjusted plots so they are visually comparable
ylim_range <- range(-2, 6.5)
# Plot one point per group (Total, AFR, HIS, EUR, OTH)
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups", xaxt = "n", col = "#3C8474")
axis(1, at = 1:5, labels = quintile_5_data$group)
# Add error bars for 95% CI
# NOTE(review): point color "#3C8474" differs from the error-bar / palette
# color "#3C8C78" -- confirm whether this is intentional or a hex typo
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
code = 3, angle = 90, length = 0.05, col = "#3C8C78")
Now let's compute the odds ratios for the 5th quintile by ancestry, corrected post-integration of the variables
# Extract quintile 5 data for each *post-integration adjusted* summary.
# NOTE: this overwrites the quintile_5_data frame built for the unadjusted
# plot above.
quintile_5_data <- data.frame(
group = c("Total","AFR", "HIS", "EUR", "OTH"),
odds_ratio = c(combined_odds_summary_adjust$odds_ratio[5],
combined_odds_summary_AFR_adjust$odds_ratio[5],
combined_odds_summary_HIS_adjust$odds_ratio[5],
combined_odds_summary_EUR_adjust$odds_ratio[5],
combined_odds_summary_OTH_adjust$odds_ratio[5]),
lower_ci = c(combined_odds_summary_adjust$lower_ci[5],
combined_odds_summary_AFR_adjust$lower_ci[5],
combined_odds_summary_HIS_adjust$lower_ci[5],
combined_odds_summary_EUR_adjust$lower_ci[5],
combined_odds_summary_OTH_adjust$lower_ci[5]),
upper_ci = c(combined_odds_summary_adjust$upper_ci[5],
combined_odds_summary_AFR_adjust$upper_ci[5],
combined_odds_summary_HIS_adjust$upper_ci[5],
combined_odds_summary_EUR_adjust$upper_ci[5],
combined_odds_summary_OTH_adjust$upper_ci[5])
)
# Log2 transformation of ORs and CI bounds (log2 OR = 0 means OR = 1)
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)
# Fixed y-limits shared with the unadjusted plot for comparability
ylim_range <- range(-2, 6.5)
# Plot one point per group (Total, AFR, HIS, EUR, OTH)
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted", xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)
# Add error bars for 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
code = 3, angle = 90, length = 0.05, col = "black")
Now let's compute the odds ratios for the 5th quintile by ancestry, corrected pre-integration of the variables
# Re-quintile on the *pre-integration* ancestry-adjusted score
# (Scores_ANCESTRY_adjusted). NOTE: this overwrites rank_adjusted and
# score_adjusted_quintiles, which previously held post-integration quintiles.
categorized_combined_data <- categorized_combined_data %>%
mutate(
rank_adjusted = rank(Scores_ANCESTRY_adjusted, ties.method = "first"),
score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
)
# Refresh the ancestry subsets so they carry the new quintile column
AFR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "AFR", ]
HIS <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "HIS", ]
EUR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "EUR", ]
OTH <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "OTH", ]
# Apply the OR function again, this time on quintiles of the
# pre-integration ancestry-adjusted score. NOTE: these overwrite the
# unadjusted-score summaries of the same names computed earlier.
combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR = calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS = calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR = calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH = calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)
combined_odds_summary
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 156 42 0.269 1
## 2 2 141 58 0.411 1.53
## 3 3 123 75 0.610 2.26
## 4 4 88 111 1.26 4.69
## 5 5 29 170 5.86 21.8
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, AFR subset (pre-integration adjusted quintiles)
combined_odds_summary_AFR
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 56 11 0.196 1
## 2 2 48 11 0.229 1.17
## 3 3 42 19 0.452 2.30
## 4 4 36 23 0.639 3.25
## 5 5 11 26 2.36 12.0
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, HIS subset (pre-integration adjusted quintiles)
combined_odds_summary_HIS
## # A tibble: 5 × 8
## score_adjusted_quinti…¹ count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 64 6 0.0938 1
## 2 2 66 11 0.167 1.78
## 3 3 54 13 0.241 2.57
## 4 4 38 21 0.553 5.89
## 5 5 8 7 0.875 9.33
## # ℹ abbreviated name: ¹score_adjusted_quintiles
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, EUR subset (pre-integration adjusted quintiles)
combined_odds_summary_EUR
## # A tibble: 5 × 8
## score_adjusted_quinti…¹ count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 29 15 0.517 1
## 2 2 22 33 1.5 2.9
## 3 3 19 39 2.05 3.97
## 4 4 13 59 4.54 8.77
## 5 5 7 100 14.3 27.6
## # ℹ abbreviated name: ¹score_adjusted_quintiles
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Per-quintile odds ratios, OTH subset (pre-integration adjusted quintiles)
combined_odds_summary_OTH
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_case odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 7 10 1.43 1
## 2 2 5 3 0.6 0.42
## 3 3 8 4 0.5 0.35
## 4 4 1 8 8 5.6
## 5 5 3 37 12.3 8.63
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Extract quintile 5 data for each summary.
# BUG FIX: this section plots the *pre-integration* ancestry-corrected
# results just recomputed above (combined_odds_summary, *_AFR, *_HIS, ...),
# but the original copy-pasted the post-integration "_adjust" objects from
# the previous section, so the freshly recomputed summaries were never
# plotted. Use the un-suffixed summaries.
quintile_5_data <- data.frame(
group = c("Total","AFR", "HIS", "EUR", "OTH"),
odds_ratio = c(combined_odds_summary$odds_ratio[5],
combined_odds_summary_AFR$odds_ratio[5],
combined_odds_summary_HIS$odds_ratio[5],
combined_odds_summary_EUR$odds_ratio[5],
combined_odds_summary_OTH$odds_ratio[5]),
lower_ci = c(combined_odds_summary$lower_ci[5],
combined_odds_summary_AFR$lower_ci[5],
combined_odds_summary_HIS$lower_ci[5],
combined_odds_summary_EUR$lower_ci[5],
combined_odds_summary_OTH$lower_ci[5]),
upper_ci = c(combined_odds_summary$upper_ci[5],
combined_odds_summary_AFR$upper_ci[5],
combined_odds_summary_HIS$upper_ci[5],
combined_odds_summary_EUR$upper_ci[5],
combined_odds_summary_OTH$upper_ci[5])
)
# Log2 transformation of ORs and CI bounds (log2 OR = 0 means OR = 1)
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)
# Fixed y-limits shared with the other OR plots for comparability
ylim_range <- range(-2, 6.5)
# Plot -- "adjusted" in the title here refers to the pre-integration
# ancestry adjustment of the scores
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted", xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)
# Add error bars for 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
code = 3, angle = 90, length = 0.05, col = "black")
Plot the ROC for GAPS on the data adjusted for ancestry post-integration
# Convert 'Category' into a binary format: 0 for control, 1 for case
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
# ROC of the ancestry-adjusted predicted probability against case status
# (pROC::roc; plot = TRUE draws the curve immediately)
roc_result_full_adjusted <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$adjusted_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full_adjusted)
## Area under the curve: 0.7589
Plot the ROC for GAPS on the data adjusted for ancestry post-integration on the replication cohort
# Regress the integrated score on ancestry PCs 1-10 in the discovery data
pc_terms <- paste0("PC", 1:10)
ancestry_regression_model <- lm(
reformulate(pc_terms, response = "scores"),
data = filtered_categorized_combined_data
)
ancestry_coefficients <- coef(ancestry_regression_model)
# Remove the fitted ancestry component from the replication scores.
# predict() on the lm fit computes exactly
# intercept + sum(PC_i * beta_i), replacing the hand-written ten-term
# dot product with an equivalent single call.
categorized_replication_data$adjusted_scores_replication <-
categorized_replication_data$scores_replication -
unname(predict(ancestry_regression_model, newdata = categorized_replication_data))
# Map the ancestry-adjusted replication scores onto (0, 1) probabilities
# via the inverse logit (plogis)
categorized_replication_data <- categorized_replication_data %>%
mutate(adjusted_probability_replication = plogis(adjusted_scores_replication))
# Compute the ROC curve (overwrites the discovery-cohort roc object of the
# same name with the replication-cohort result)
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$adjusted_probability_replication)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full_adjusted)
## Area under the curve: 0.7918
# Plot the ROC curve with the x-axis reversed
plot(roc_result_full_adjusted,xlim = c(1, 0))
Perform 50 iterations of 5-fold cross-validation on the discovery cohort and plot the distribution of outcomes
# Repeat 5-fold cross-validation n_repeats times, refitting each candidate
# logistic-regression model (caret::train, method = "glm") and recording
# its cross-validated AUC per iteration.
# number of iterations
n_repeats <- 50
# Store results (grown with rbind each iteration; acceptable at this scale,
# though collecting a list and binding once would avoid repeated copying)
auc_results <- data.frame()
for (i in 1:n_repeats) {
# Reseed per iteration so each repeat's fold assignment is reproducible
set.seed(i)
# cross-validation method: 5-fold CV, class probabilities enabled so
# twoClassSummary can compute ROC/Sens/Spec
cv_folds <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
# Train models again -- one model per feature set (rare coding, common
# PRS traits, noncoding, and their combinations)
cv_model_cardiomyopathy_rare <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_epilepsy_rare <- suppressWarnings(train(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_noncoding_rare <- suppressWarnings(train(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_common <- suppressWarnings(train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_coding_rare <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_epilepsy_and_common <- suppressWarnings(train(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_cardiac_and_common <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_common_and_coding <- suppressWarnings(train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_full <- suppressWarnings(
train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV +
Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare +
PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare +
Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
# Store results for this iteration. "glm" has no tuning grid, so
# results$ROC has a single row and max() just extracts it.
temp_results <- data.frame(
Model = c("Cardiomyopathy Rare", "Epilepsy Rare", "Noncoding Rare", "Common Traits", "Coding Rare",
"Epilepsy & Common", "Cardiac & Common", "Common & Coding", "Common, Coding & Noncoding"),
AUC = c(
max(cv_model_cardiomyopathy_rare$results$ROC),
max(cv_model_epilepsy_rare$results$ROC),
max(cv_model_noncoding_rare$results$ROC),
max(cv_model_common$results$ROC),
max(cv_model_coding_rare$results$ROC),
max(cv_model_epilepsy_and_common$results$ROC),
max(cv_model_cardiac_and_common$results$ROC),
max(cv_model_common_and_coding$results$ROC),
max(cv_model_full$results$ROC)
),
Iteration = i
)
auc_results <- rbind(auc_results, temp_results)
}
# Wes Anderson color palette (Royal1 + Royal2) for the nine-model AUC boxplot
AUC_palette <- c(wes_palette("Royal1"), wes_palette("Royal2"))
# Boxplot of the per-iteration CV AUC distribution for each model.
# FIX: theme_cowplot() is a *complete* theme and was applied after
# theme(axis.text.x = ...), which reset the rotated x-axis labels (and made
# the earlier theme_minimal() call a no-op). Apply the complete theme first,
# then the axis-text tweak.
ggplot(auc_results, aes(x = Model, y = AUC, fill = Model)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = AUC_palette) + # Apply custom palette
labs(title = "AUC Distributions Across Models", x = "Model", y = "AUC") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# NOTE(review): library(dplyr) mid-script is redundant if dplyr was attached
# at the top of the file -- consider consolidating library() calls there
library(dplyr)
# Median and IQR of the CV AUC across the 50 iterations, per model
summary_stats <- auc_results %>%
group_by(Model) %>%
summarise(Median_AUC = median(AUC),
IQR_AUC = IQR(AUC))
print(summary_stats)
## # A tibble: 9 × 3
## Model Median_AUC IQR_AUC
## <chr> <dbl> <dbl>
## 1 Cardiac & Common 0.717 0.00474
## 2 Cardiomyopathy Rare 0.683 0.00300
## 3 Coding Rare 0.689 0.00460
## 4 Common & Coding 0.720 0.00697
## 5 Common Traits 0.663 0.00671
## 6 Common, Coding & Noncoding 0.738 0.00610
## 7 Epilepsy & Common 0.684 0.00727
## 8 Epilepsy Rare 0.640 0.00332
## 9 Noncoding Rare 0.581 0.00289
Now down-sample 20 different times and re-run the 5-fold validations so that the ancestries match.
# 20 random ancestry-balanced downsamples, each evaluated with 5-fold
# stratified cross-validation of the full model; all fold-level AUCs and
# each downsample's mean AUC are accumulated in auc_results.
# Initialize a dataframe to store all AUCs
auc_results <- data.frame(
Outer_Fold = integer(),
Inner_Fold = integer(),
AUC = numeric(),
stringsAsFactors = FALSE
)
# Define number of outer folds
outer_folds <- 20
inner_folds <- 5
# Outer loop: 20 iterations
# NOTE(review): no set.seed() before the random downsampling/fold draws,
# so results differ run-to-run -- confirm whether that is intended
for (i in 1:outer_folds) {
# Create a new randomly downsampled dataset (ensuring balanced case/control per ancestry)
# "zCase" is presumably the recoded case label (z-prefix to control the
# factor-level order for caret's class probabilities) -- confirm upstream
downsampled_data <- bind_rows(
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "EUR", Category == "zCase") %>% slice_sample(n = 90),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "EUR", Category == "Control") %>% slice_sample(n = 90),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "HIS", Category == "zCase") %>% slice_sample(n = 58),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "HIS", Category == "Control") %>% slice_sample(n = 58),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "AFR", Category == "zCase") %>% slice_sample(n = 90),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "AFR", Category == "Control") %>% slice_sample(n = 90),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "OTH", Category == "zCase") %>% slice_sample(n = 24),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "OTH", Category == "Control") %>% slice_sample(n = 24)
)
# Ensure binary outcome is properly coded
downsampled_data$binary_outcome <- ifelse(downsampled_data$Category == "zCase", 1, 0)
# Create 5 stratified folds **by both ancestry and category**
folds <- downsampled_data %>%
group_by(cluster_gmm_Ancestry, Category) %>%
mutate(Fold = sample(rep(1:inner_folds, length.out = n()))) %>%
ungroup()
# Train and test across the 5 folds within this downsampled dataset
inner_auc_values <- numeric(inner_folds)
for (j in 1:inner_folds) {
# Split into training and testing sets
train_data <- folds %>% filter(Fold != j) %>% dplyr::select(-Fold)
test_data <- folds %>% filter(Fold == j) %>% dplyr::select(-Fold)
# Train model (full feature set: common PRS traits + rare coding +
# noncoding burdens) on the training folds only
cv_model <- suppressWarnings(
train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV +
Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare +
PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare +
Cardiomyopathy_null + Epilepsy_null,
data = train_data, method = "glm", family = binomial)
)
# Get predicted probabilities for the positive class (zCase)
test_data$predicted_prob <- predict(cv_model, test_data, type = "prob")[, "zCase"]
# Compute ROC curve and AUC
roc_result <- roc(test_data$binary_outcome, test_data$predicted_prob)
inner_auc_values[j] <- auc(roc_result)
# Store the AUC result
auc_results <- rbind(auc_results, data.frame(Outer_Fold = i, Inner_Fold = j, AUC = inner_auc_values[j]))
}
# Store the mean AUC from the 5 inner folds for this outer iteration.
# These summary rows carry Inner_Fold = NA; downstream filtering must
# exclude them before averaging fold-level AUCs.
auc_results <- rbind(auc_results, data.frame(Outer_Fold = i, Inner_Fold = NA, AUC = mean(inner_auc_values)))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Ensure Inner_Fold is treated as character for filtering
auc_results <- auc_results %>%
  mutate(Inner_Fold = as.character(Inner_Fold))
# Extract inner fold AUCs for the histogram, coercing AUC to numeric
# BEFORE any averaging. (Previously the coercion happened after
# summarise(), so mean() could run on a non-numeric column and fail or
# return NA with a warning.)
inner_auc_data <- auc_results %>%
  filter(Inner_Fold != "Mean") %>% # Exclude previous Mean entries if they exist
  mutate(AUC = as.numeric(AUC))
# Compute mean AUC per outer fold; na.rm guards against folds whose AUC
# failed to parse
outer_auc_means <- inner_auc_data %>%
  group_by(Outer_Fold) %>%
  summarise(AUC = mean(AUC, na.rm = TRUE)) %>%
  ungroup()
# Print to verify correct means
print(outer_auc_means)
## # A tibble: 20 × 2
## Outer_Fold AUC
## <int> <dbl>
## 1 1 0.668
## 2 2 0.676
## 3 3 0.665
## 4 4 0.676
## 5 5 0.699
## 6 6 0.686
## 7 7 0.678
## 8 8 0.684
## 9 9 0.675
## 10 10 0.682
## 11 11 0.652
## 12 12 0.682
## 13 13 0.673
## 14 14 0.604
## 15 15 0.669
## 16 16 0.672
## 17 17 0.676
## 18 18 0.682
## 19 19 0.681
## 20 20 0.676
Now plot the distribution of all the down-sampled, ancestry-matched AUC results, layering the mean of each outer fold on top.
# Create a stacked bar plot showing the percentage ancestry makeup of
# each category.
# NOTE: theme_cowplot() is a *complete* theme, so it must be added
# BEFORE any theme() tweaks — in the original order it was applied
# last and silently reset legend.position = "bottom".
ancestry_makeup <- ggplot(downsampled_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
  xlab("Category") +
  ylab("Percentage") +
  scale_fill_manual(values = palette_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom")
ancestry_makeup
# Histogram of all inner-fold AUCs, with each outer fold's mean AUC
# overlaid as a dashed black vertical line and the chance level (0.5)
# marked by a solid red line.
# `linewidth` replaces the `size` aesthetic for lines, which is
# deprecated since ggplot2 3.4.0 and emits a warning.
auc_histogram <- ggplot(inner_auc_data, aes(x = AUC)) +
  geom_histogram(binwidth = 0.01, fill = "lightblue", color = "black") +
  geom_vline(data = outer_auc_means, aes(xintercept = AUC),
             color = "black", linetype = "dashed", linewidth = 0.5) +
  geom_vline(xintercept = 0.5, color = "red", linetype = "solid", linewidth = 1) +
  xlab("AUC") +
  ylab("Frequency") +
  ggtitle("Histogram of Inner Fold AUCs with Outer Fold Means") +
  theme_cowplot(12)
auc_histogram
# Collapse the per-outer-fold means into a single overall
# cross-validated AUC (mean of the 20 outer-fold means)
mean_outer_auc <- mean(outer_auc_means$AUC)
# Report the overall value
print(paste("Mean AUC across outer folds:", mean_outer_auc))
## [1] "Mean AUC across outer folds: 0.672768927539058"
# Load the reporter assay data (one column per condition;
# check.names = FALSE keeps the original condition labels intact)
reporter_data <- read.delim("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/Reporter_analysis.txt", check.names = FALSE)
# Reshape wide -> long: one row per (Condition, Activity) measurement
reporter_long <- reporter_data %>%
  pivot_longer(cols = everything(), names_to = "Condition", values_to = "Activity")
library(ggplot2)
# Upper y-limit: 10% headroom above the largest observed activity
activity_ceiling <- max(reporter_long$Activity, na.rm = TRUE) * 1.1
# Boxplots per condition with the raw points jittered on top; box
# outliers are suppressed (outlier.shape = NA) so points are not drawn twice
ggplot(reporter_long, aes(x = Condition, y = Activity, fill = Condition)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.6, color = "black") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none") +
  ylab("Reporter Activity") +
  xlab("") +
  coord_cartesian(ylim = c(0, activity_ceiling)) +
  ggtitle("Reporter Assay Activity by Condition")